diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f5e1a8128..56f0d7ca1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1455,8 +1455,8 @@ if(WITH_WEBP OR HAVE_WEBP) endif() if(WITH_AVIF OR HAVE_AVIF) - if(AVIF_VERSION) - status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${AVIF_VERSION})" ELSE "NO") + if(libavif_VERSION) + status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${libavif_VERSION})" ELSE "NO") else() status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY}" ELSE "NO") endif() @@ -1852,6 +1852,7 @@ if(BUILD_opencv_python3) else() status(" Libraries:" HAVE_opencv_python3 THEN "${PYTHON3_LIBRARIES}" ELSE NO) endif() + status(" Limited API:" PYTHON3_LIMITED_API THEN "YES (ver ${PYTHON3_LIMITED_API_VERSION})" ELSE NO) status(" numpy:" PYTHON3_NUMPY_INCLUDE_DIRS THEN "${PYTHON3_NUMPY_INCLUDE_DIRS} (ver ${PYTHON3_NUMPY_VERSION})" ELSE "NO (Python3 wrappers can not be generated)") status(" install path:" HAVE_opencv_python3 THEN "${__INSTALL_PATH_PYTHON3}" ELSE "-") endif() diff --git a/README.md b/README.md index 25c984e1f2..4377b666d1 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,5 @@ ## OpenCV: Open Source Computer Vision Library -### Keep OpenCV Free - -OpenCV is raising funds to keep the library free for everyone, and we need the support of the entire community to do it. [Donate to OpenCV on IndieGoGo](http://igg.me/at/opencv5) before the campaign ends on December 16 to show your support. ### Resources @@ -13,6 +10,7 @@ OpenCV is raising funds to keep the library free for everyone, and we need the s * previous forum (read only): * Issue tracking: * Additional OpenCV functionality: +* Donate to OpenCV: ### Contributing diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index ea577e4a1c..ff0e40c666 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -484,21 +484,19 @@ macro(ocv_check_compiler_optimization OPT) endmacro() macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION) - if(NOT MSVC) - unset(_POSTFIX) - # Check each feature option - foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) - string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) - if(NOT ${OPT_FOUND} EQUAL -1) - string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") - string(APPEND _POSTFIX "${TRAILING_PART}") - string(REPLACE " ${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) - endif() - endforeach() - # If more than one option found, merge them - if(NOT "x${_POSTFIX}" STREQUAL "x") - set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") + unset(_POSTFIX) + # Check each feature option + foreach(OPT IN LISTS ${FEATURE_NAME_LIST}) + string(FIND "${${FLAG_STRING}}" "${CPU_${OPT}_FLAGS_ON}" OPT_FOUND) + if(NOT ${OPT_FOUND} EQUAL -1) + string(REPLACE "${COMMON_OPTION}" "" TRAILING_PART "${CPU_${OPT}_FLAGS_ON}") + string(APPEND _POSTFIX "${TRAILING_PART}") + string(REPLACE " ${CPU_${OPT}_FLAGS_ON}" "" ${FLAG_STRING} ${${FLAG_STRING}}) endif() + endforeach() + # If more than one option found, merge them + if(NOT "x${_POSTFIX}" STREQUAL "x") + set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}") endif() endmacro() @@ -596,10 +594,12 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS}) endforeach() if(AARCH64) + if(NOT MSVC) # Define the list of NEON options to check set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16) set(BASE_ARCHITECTURE "-march=armv8.2-a") 
ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE}) + endif() endif() foreach(OPT ${CPU_BASELINE_REQUIRE}) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 427189c079..f23bb13dc5 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -1,13 +1,6 @@ if("${CMAKE_CXX_COMPILER};${CMAKE_C_COMPILER};${CMAKE_CXX_COMPILER_LAUNCHER}" MATCHES "ccache") - set(CMAKE_COMPILER_IS_CCACHE 1) # TODO: FIXIT Avoid setting of CMAKE_ variables set(OPENCV_COMPILER_IS_CCACHE 1) endif() -function(access_CMAKE_COMPILER_IS_CCACHE) - if(NOT OPENCV_SUPPRESS_DEPRECATIONS) - message(WARNING "DEPRECATED: CMAKE_COMPILER_IS_CCACHE is replaced to OPENCV_COMPILER_IS_CCACHE.") - endif() -endfunction() -variable_watch(CMAKE_COMPILER_IS_CCACHE access_CMAKE_COMPILER_IS_CCACHE) if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE) # This works fine with Unix Makefiles and Ninja generators find_host_program(CCACHE_PROGRAM ccache) @@ -391,7 +384,7 @@ endif() # Apply "-Wl,--no-undefined" linker flags: https://github.com/opencv/opencv/pull/21347 if(NOT OPENCV_SKIP_LINK_NO_UNDEFINED) - if(UNIX AND (NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2")) + if(UNIX AND ((NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2") AND NOT CMAKE_SYSTEM_NAME MATCHES "OpenBSD")) set(_option "-Wl,--no-undefined") set(_saved_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${_option}") # requires CMake 3.2+ and CMP0056 diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake index e0c539b90a..06998400d7 100644 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@ -136,11 +136,11 @@ macro(ocv_check_windows_crt_linkage) cmake_policy(GET CMP0091 MSVC_RUNTIME_SET_BY_ABSTRACTION) if(MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW") if(NOT BUILD_SHARED_LIBS AND BUILD_WITH_STATIC_CRT) - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MT") - set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") else() - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MD") - set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MDd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd") endif() endif() endif() diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake index e798789b51..60ef78b7fe 100644 --- a/cmake/OpenCVDetectPython.cmake +++ b/cmake/OpenCVDetectPython.cmake @@ -270,6 +270,18 @@ find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYT PYTHON3_INCLUDE_DIR PYTHON3_INCLUDE_DIR2 PYTHON3_PACKAGES_PATH PYTHON3_NUMPY_INCLUDE_DIRS PYTHON3_NUMPY_VERSION) +# Problem in numpy >=1.15 <1.17 +OCV_OPTION(PYTHON3_LIMITED_API "Build with Python Limited API (not available with numpy >=1.15 <1.17)" NO + VISIBLE_IF PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" OR NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.17") +if(PYTHON3_LIMITED_API) + set(_default_ver "0x03060000") + if(PYTHON3_VERSION_STRING VERSION_LESS "3.6") + # fix for older pythons + set(_default_ver "0x030${PYTHON3_VERSION_MINOR}0000") + endif() + set(PYTHON3_LIMITED_API_VERSION ${_default_ver} CACHE STRING "Minimal Python version for Limited API") +endif() +
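(Note on the PYTHON3_LIMITED_API_VERSION default added above: the hex constant follows CPython's PY_VERSION_HEX layout, 0xAABBCC00 with AA = major, BB = minor, CC = micro, so 0x03060000 encodes Python 3.6. Below is a minimal sketch, not code from this patch, of how a stable-ABI extension translation unit typically consumes such a value; the function name is hypothetical.)

@code{.cpp}
// Py_LIMITED_API must be defined to the minimum supported CPython version
// before Python.h is included; the build passes the same hex value that
// PYTHON3_LIMITED_API_VERSION holds (0x03060000 == CPython 3.6).
#define Py_LIMITED_API 0x03060000
#include <Python.h>

static PyObject* hello(PyObject*, PyObject*)  // hypothetical example function
{
    return PyUnicode_FromString("compiled against the stable ABI");
}
@endcode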
if(PYTHON_DEFAULT_EXECUTABLE) set(PYTHON_DEFAULT_AVAILABLE "TRUE") elseif(PYTHON3_EXECUTABLE AND PYTHON3INTERP_FOUND) diff --git a/cmake/OpenCVFindCANN.cmake b/cmake/OpenCVFindCANN.cmake index 913c1887e7..36d160d0f4 100644 --- a/cmake/OpenCVFindCANN.cmake +++ b/cmake/OpenCVFindCANN.cmake @@ -57,6 +57,18 @@ if(CANN_INSTALL_DIR) set(HAVE_CANN OFF) return() endif() + + # * libacl_dvpp_mpi.so + set(libacl_dvpp_mpi "${CANN_INSTALL_DIR}/lib64") + find_library(found_libacldvppmpi NAMES acl_dvpp_mpi PATHS ${libacl_dvpp_mpi} NO_DEFAULT_PATH) + if(found_libacldvppmpi) + set(libacl_dvpp_mpi ${found_libacldvppmpi}) + message(STATUS "CANN: libacl_dvpp_mpi.so is found at ${libacl_dvpp_mpi}") + else() + message(STATUS "CANN: Missing libacl_dvpp_mpi.so. Turning off HAVE_CANN") + set(HAVE_CANN OFF) + return() + endif() # * libgraph.so set(lib_graph "${CANN_INSTALL_DIR}/compiler/lib64") find_library(found_lib_graph NAMES graph PATHS ${lib_graph} NO_DEFAULT_PATH) @@ -105,6 +117,7 @@ if(CANN_INSTALL_DIR) list(APPEND libs_cann ${lib_opsproto}) list(APPEND libs_cann ${lib_graph}) list(APPEND libs_cann ${lib_ge_compiler}) + list(APPEND libs_cann ${libacl_dvpp_mpi}) # * lib_graph_base.so if(NOT CANN_VERSION_BELOW_6_3_ALPHA002) diff --git a/cmake/android/android_gradle_projects.cmake b/cmake/android/android_gradle_projects.cmake index 7c80d5777e..90a13a967e 100644 --- a/cmake/android/android_gradle_projects.cmake +++ b/cmake/android/android_gradle_projects.cmake @@ -89,15 +89,11 @@ else() ocv_update(OPENCV_ANDROID_NAMESPACE_DECLARATION "") endif() -# set android gradle java version in build.gradle and set aidl config if(NOT (ANDROID_GRADLE_PLUGIN_VERSION VERSION_LESS "8.0.0")) # AGP-8.0 requires a minimum JDK version of JDK17 ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "17") - # Enable aidl configuration for OpenCV compile with AGP-8.0 - ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "buildFeatures { aidl true }") else() ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "1_8") - ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "") endif() set(ANDROID_GRADLE_JAVA_VERSION "${ANDROID_GRADLE_JAVA_VERSION_INIT}" CACHE STRING "Android Gradle Java version") diff --git a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown index ae24fbb42f..389181de3b 100644 --- a/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown +++ b/doc/tutorials/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.markdown @@ -9,6 +9,9 @@ How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_u | -: | :- | | Compatibility | OpenCV >= 3.0 | + +@note See also C++ lambda usage with parallel_for_ in this [tutorial](@ref tutorial_how_to_use_OpenCV_parallel_for_new). + Goal ---- @@ -20,7 +23,7 @@ If you want more information about multithreading, you will have to refer to a r to remain simple. Precondition ----- +------------ The first precondition is to have OpenCV built with a parallel framework. In OpenCV 3.2, the following parallel frameworks are available in that order: @@ -50,7 +53,7 @@ We will use the example of drawing a Mandelbrot set to show how from a regular s the code to parallelize the computation. Theory ------------ +------ The Mandelbrot set definition has been named in tribute to the mathematician Benoit Mandelbrot by the mathematician Adrien Douady.
It has been famous outside of the mathematics field as the image representation is an example of a @@ -69,7 +72,7 @@ Here, we will just introduce the formula to draw the Mandelbrot set (from the me > \f[\limsup_{n\to\infty}|z_{n+1}|\leqslant2\f] Pseudocode ------------ +---------- A simple algorithm to generate a representation of the Mandelbrot set is called the ["escape time algorithm"](https://en.wikipedia.org/wiki/Mandelbrot_set#Escape_time_algorithm). @@ -110,10 +113,10 @@ On this figure, we recall that the real part of a complex number is on the x-axi You can see that the whole shape can be repeatedly visible if we zoom at particular locations. Implementation ------------ +-------------- Escape time algorithm implementation --------------------------- +------------------------------------ @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-escape-time-algorithm @@ -121,7 +124,7 @@ Here, we used the [`std::complex`](http://en.cppreference.com/w/cpp/numeric/comp complex number. This function performs the test to check if the pixel is in set or not and returns the "escaped" iteration. Sequential Mandelbrot implementation --------------------------- +------------------------------------ @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-sequential @@ -149,7 +152,7 @@ The green curve corresponds to a simple linear scale transformation, the blue on and you can observe how the lowest values will be boosted when looking at the slope at these positions. Parallel Mandelbrot implementation --------------------------- +---------------------------------- When looking at the sequential implementation, we can notice that each pixel is computed independently. To optimize the computation, we can perform multiple pixel calculations in parallel, by exploiting the multi-core architecture of modern @@ -181,7 +184,7 @@ C++ 11 standard allows to simplify the parallel implementation by get rid of the @snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel-call-cxx11 Results ------------ +------- You can find the full tutorial code [here](https://github.com/opencv/opencv/blob/5.x/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp). The performance of the parallel implementation depends of the type of CPU you have. For instance, on 4 cores / 8 threads diff --git a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown index 68d7ab3644..331c6bfb51 100644 --- a/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown +++ b/doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.markdown @@ -18,7 +18,7 @@ This tutorial assumes you have the following installed and configured: - Android Studio - JDK - Android SDK and NDK -- OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases) +- Optional: OpenCV for Android SDK from the official [release page on Github](https://github.com/opencv/opencv/releases) or [SourceForge](https://sourceforge.net/projects/opencvlibrary/). Advanced: alternatively, the SDK may be built from source code following the [instructions on the wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build).
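(For readers of the parallel_for_ tutorial hunks above: a minimal sketch of the lambda-based call that the added @note refers to. This is an illustration, not code from this patch, and the helper name is hypothetical.)

@code{.cpp}
#include <opencv2/core.hpp>

// Invert a grayscale image row by row; cv::parallel_for_ splits the row
// range across worker threads and each lambda invocation receives one
// sub-range, so the body only expresses the per-row work.
void invertRows(cv::Mat& img)
{
    CV_Assert(img.type() == CV_8UC1);
    cv::parallel_for_(cv::Range(0, img.rows), [&](const cv::Range& range)
    {
        for (int r = range.start; r < range.end; r++)
        {
            uchar* p = img.ptr<uchar>(r);
            for (int c = 0; c < img.cols; c++)
                p[c] = 255 - p[c];
        }
    });
}
@endcode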
@@ -26,8 +26,9 @@ If you need help with anything of the above, you may refer to our @ref tutorial_ If you encounter any error after thoroughly following these steps, feel free to contact us via OpenCV [forum](https://forum.opencv.org). We'll do our best to help you out. -Hello OpenCV sample ------------------- + +Hello OpenCV sample with SDK +---------------------------- In this section we're gonna create a simple app that does nothing but OpenCV loading. In next section we'll extend it to support camera. @@ -75,11 +76,10 @@ In addition to this instruction you can use some video guide, for example [this @endcode The fix was found [here](https://stackoverflow.com/questions/73225714/import-opencv-sdk-to-android-studio-chipmunk) -6. OpenCV project uses `aidl` and `buildConfig` features. Please enable them in +6. OpenCV project uses the `buildConfig` feature. Please enable it in `MyApplication/OpenCV/build.gradle` file to `android` block: @code{.gradle} buildFeatures{ - aidl true buildConfig true } @@ -115,6 +115,43 @@ In addition to this instruction you can use some video guide, for example [this ![](images/run_app.png) +Hello OpenCV sample with Maven Central +-------------------------------------- + +Since OpenCV 4.9.0, the OpenCV for Android package is available on Maven Central and may be installed +automatically as a Gradle dependency. In this section we're going to create a simple app that does nothing +but load OpenCV from Maven Central. + +1. Open Android Studio and create an empty project by choosing ***Empty Views Activity*** + + ![](images/create_empty_project.png) + +2. Set up the project: + - Choose ***Java*** language + - Choose ***Groovy DSL*** build configuration language + - Choose a ***Minimum SDK*** version not less than the one OpenCV supports. For 4.9.0 the minimal SDK version is 21. + + ![](images/setup_project.png) + +3. Edit `build.gradle` and add the OpenCV library to the dependencies list like this: + @code{.gradle} + dependencies { + implementation 'org.opencv:opencv:4.9.0' + } + @endcode + `4.9.0` may be replaced by any version available as an [official release](https://central.sonatype.com/artifact/org.opencv/opencv). + +4. Before using any OpenCV function you have to load the library first. If your application includes other + OpenCV-dependent native libraries you should load them ***after*** OpenCV initialization. Add the following + code to load the library at app start: + @snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java ocv_loader_init + Like this: + ![](images/sample_code.png) + +5. Choose a device to check the sample on and run the code by pressing the `run` button + + ![](images/run_app.png) + Camera view sample ------------------ diff --git a/doc/tutorials/introduction/windows_install/windows_install.markdown b/doc/tutorials/introduction/windows_install/windows_install.markdown index eabf31482f..2568592d0c 100644 --- a/doc/tutorials/introduction/windows_install/windows_install.markdown +++ b/doc/tutorials/introduction/windows_install/windows_install.markdown @@ -378,6 +378,9 @@ our OpenCV library that we use in our projects.
Start up a command window and enter: setx OpenCV_DIR D:\OpenCV\build\x64\vc16 (suggested for Visual Studio 2019 - 64 bit Windows) setx OpenCV_DIR D:\OpenCV\build\x86\vc16 (suggested for Visual Studio 2019 - 32 bit Windows) + + setx OpenCV_DIR D:\OpenCV\build\x64\vc17 (suggested for Visual Studio 2022 - 64 bit Windows) + setx OpenCV_DIR D:\OpenCV\build\x86\vc17 (suggested for Visual Studio 2022 - 32 bit Windows) @endcode Here the directory is where you have your OpenCV binaries (*extracted* or *built*). You can have different platform (e.g. x64 instead of x86) or compiler type, so substitute appropriate value. diff --git a/modules/core/include/opencv2/core/async.hpp b/modules/core/include/opencv2/core/async.hpp index 54560c7d00..98868a130b 100644 --- a/modules/core/include/opencv2/core/async.hpp +++ b/modules/core/include/opencv2/core/async.hpp @@ -7,10 +7,8 @@ #include -#ifdef CV_CXX11 //#include #include -#endif namespace cv { @@ -69,7 +67,6 @@ public: CV_WRAP bool valid() const CV_NOEXCEPT; -#ifdef CV_CXX11 inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; } inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; } @@ -89,7 +86,6 @@ public: std::future getFutureMat() const; std::future getFutureUMat() const; #endif -#endif // PImpl diff --git a/modules/core/include/opencv2/core/cv_cpu_dispatch.h b/modules/core/include/opencv2/core/cv_cpu_dispatch.h index de7b84b82a..8269fa6121 100644 --- a/modules/core/include/opencv2/core/cv_cpu_dispatch.h +++ b/modules/core/include/opencv2/core/cv_cpu_dispatch.h @@ -147,7 +147,7 @@ #endif #if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071) -# include +# include # define CV_RVV071 1 #endif diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 7f0790834c..7cf4627daa 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -476,6 +476,8 @@ Cv64suf; #define CV_WRAP_MAPPABLE(mappable) #define CV_WRAP_PHANTOM(phantom_header) #define CV_WRAP_DEFAULT(val) +/* Indicates that the function parameter has filesystem path semantics */ +#define CV_WRAP_FILE_PATH /****************************************************************************************\ * Matrix type (Mat) * \****************************************************************************************/ @@ -755,89 +757,44 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType) #endif -/****************************************************************************************\ -* CV_NODISCARD attribute (deprecated, GCC only) * -* DONT USE: use instead the standard CV_NODISCARD_STD macro above * -* this legacy method silently fails to issue warning until some version * -* after gcc 6.3.0. Yet with gcc 7+ you can use the above standard method * -* which makes this method useless. Don't use it.
* -* @deprecated use instead CV_NODISCARD_STD * -\****************************************************************************************/ -#ifndef CV_NODISCARD -# if defined(__GNUC__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) -# elif defined(__clang__) && defined(__has_attribute) -# if __has_attribute(__warn_unused_result__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) -# endif -# endif -#endif -#ifndef CV_NODISCARD -# define CV_NODISCARD /* nothing by default */ -#endif - - /****************************************************************************************\ * C++ 11 * \****************************************************************************************/ -#ifndef CV_CXX11 -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800) -# define CV_CXX11 1 +#ifdef __cplusplus +// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11, +// so check _MSC_VER instead. See: +// +# if defined(_MSC_VER) +# if _MSC_VER < 1800 +# error "OpenCV 4.x+ requires enabled C++11 support" +# endif +# elif __cplusplus < 201103L +# error "OpenCV 4.x+ requires enabled C++11 support" # endif -#else -# if CV_CXX11 == 0 -# undef CV_CXX11 -# endif -#endif -#ifndef CV_CXX11 -# error "OpenCV 4.x+ requires enabled C++11 support" #endif -#define CV_CXX_MOVE_SEMANTICS 1 -#define CV_CXX_MOVE(x) std::move(x) -#define CV_CXX_STD_ARRAY 1 -#include +#ifndef CV_CXX11 +# define CV_CXX11 1 +#endif + #ifndef CV_OVERRIDE # define CV_OVERRIDE override #endif + #ifndef CV_FINAL # define CV_FINAL final #endif #ifndef CV_NOEXCEPT -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_NOEXCEPT noexcept -# endif -#endif -#ifndef CV_NOEXCEPT -# define CV_NOEXCEPT +# define CV_NOEXCEPT noexcept #endif #ifndef CV_CONSTEXPR -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_CONSTEXPR constexpr -# endif -#endif -#ifndef CV_CONSTEXPR -# define CV_CONSTEXPR +# define CV_CONSTEXPR constexpr #endif // Integer types portability -#ifdef OPENCV_STDINT_HEADER -#include OPENCV_STDINT_HEADER -#elif defined(__cplusplus) -#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */ -namespace cv { -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int16_t; -typedef unsigned short uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -} -#elif defined(_MSC_VER) || __cplusplus >= 201103L +#ifdef __cplusplus #include namespace cv { using std::int8_t; @@ -849,19 +806,6 @@ using std::uint32_t; using std::int64_t; using std::uint64_t; } -#else -#include -namespace cv { -typedef ::int8_t int8_t; -typedef ::uint8_t uint8_t; -typedef ::int16_t int16_t; -typedef ::uint16_t uint16_t; -typedef ::int32_t int32_t; -typedef ::uint32_t uint32_t; -typedef ::int64_t int64_t; -typedef ::uint64_t uint64_t; -} -#endif #else // pure C #include #endif diff --git a/modules/core/include/opencv2/core/detail/async_promise.hpp b/modules/core/include/opencv2/core/detail/async_promise.hpp index 6eb3fb52c1..c039ec046a 100644 --- a/modules/core/include/opencv2/core/detail/async_promise.hpp +++ b/modules/core/include/opencv2/core/detail/async_promise.hpp @@ -52,10 +52,8 @@ public: */ void setException(const cv::Exception& exception); -#ifdef CV_CXX11 explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; } AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return 
*this; } -#endif // PImpl diff --git a/modules/core/include/opencv2/core/detail/exception_ptr.hpp b/modules/core/include/opencv2/core/detail/exception_ptr.hpp index d98ffc40c6..a1a591e455 100644 --- a/modules/core/include/opencv2/core/detail/exception_ptr.hpp +++ b/modules/core/include/opencv2/core/detail/exception_ptr.hpp @@ -8,14 +8,8 @@ #ifndef CV__EXCEPTION_PTR # if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2 # define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938 -# elif defined(CV_CXX11) +# else # define CV__EXCEPTION_PTR 1 -# elif defined(_MSC_VER) -# define CV__EXCEPTION_PTR (_MSC_VER >= 1600) -# elif defined(__clang__) -# define CV__EXCEPTION_PTR 0 // C++11 only (see above) -# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__) -# define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0) # endif #endif #ifndef CV__EXCEPTION_PTR diff --git a/modules/core/include/opencv2/core/eigen.hpp b/modules/core/include/opencv2/core/eigen.hpp index f176409cc3..231c6805c0 100644 --- a/modules/core/include/opencv2/core/eigen.hpp +++ b/modules/core/include/opencv2/core/eigen.hpp @@ -61,8 +61,7 @@ #endif #if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT) -#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \ - && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY) +#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 #include #define OPENCV_EIGEN_TENSOR_SUPPORT 1 #endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp index 26f478feda..e34dbc01b4 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp @@ -19,7 +19,7 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD128 1 -#define CV_SIMD128_64F 0 +#define CV_SIMD128_64F 1 //////////// Types //////////// struct v_uint8x16 { @@ -32,11 +32,11 @@ struct v_uint8x16 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) { uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16); + val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16); } uchar get0() const { - return vmv_x_s_u8m1_u8(val, 16); + return vmv_x_s_u8m1_u8(val); } vuint8m1_t val; @@ -53,11 +53,11 @@ struct v_int8x16 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) { schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = (vint8m1_t)vle_v_i8m1((schar*)v, 16); + val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16); } schar get0() const { - return vmv_x_s_i8m1_i8(val, 16); + return vmv_x_s_i8m1_i8(val); } vint8m1_t val; @@ -73,11 +73,11 @@ struct v_uint16x8 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) { ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8); + val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8); } ushort get0() const { - return vmv_x_s_u16m1_u16(val, 8); + return vmv_x_s_u16m1_u16(val); } vuint16m1_t val; @@ -93,11 +93,11 @@ struct v_int16x8 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8); + val = (vint16m1_t)vle16_v_i16m1((signed 
short*)v, 8); } short get0() const { - return vmv_x_s_i16m1_i16(val, 8); + return vmv_x_s_i16m1_i16(val); } vint16m1_t val; @@ -113,11 +113,11 @@ struct v_uint32x4 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) { unsigned v[] = {v0, v1, v2, v3}; - val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4); + val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4); } unsigned get0() const { - return vmv_x_s_u32m1_u32(val, 4); + return vmv_x_s_u32m1_u32(val); } vuint32m1_t val; @@ -133,11 +133,11 @@ struct v_int32x4 v_int32x4(int v0, int v1, int v2, int v3) { int v[] = {v0, v1, v2, v3}; - val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4); + val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4); } int get0() const { - return vmv_x_s_i32m1_i32(val, 4); + return vmv_x_s_i32m1_i32(val); } vint32m1_t val; }; @@ -152,11 +152,11 @@ struct v_float32x4 v_float32x4(float v0, float v1, float v2, float v3) { float v[] = {v0, v1, v2, v3}; - val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4); + val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4); } float get0() const { - return vfmv_f_s_f32m1_f32(val, 4); + return vfmv_f_s_f32m1_f32(val); } vfloat32m1_t val; }; @@ -171,11 +171,11 @@ struct v_uint64x2 v_uint64x2(uint64 v0, uint64 v1) { uint64 v[] = {v0, v1}; - val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2); + val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2); } uint64 get0() const { - return vmv_x_s_u64m1_u64(val, 2); + return vmv_x_s_u64m1_u64(val); } vuint64m1_t val; }; @@ -190,11 +190,11 @@ struct v_int64x2 v_int64x2(int64 v0, int64 v1) { int64 v[] = {v0, v1}; - val = (vint64m1_t)vle_v_i64m1((long*)v, 2); + val = (vint64m1_t)vle64_v_i64m1((long*)v, 2); } int64 get0() const { - return vmv_x_s_i64m1_i64(val, 2); + return vmv_x_s_i64m1_i64(val); } vint64m1_t val; }; @@ -209,21 +209,21 @@ struct v_float64x2 v_float64x2(double v0, double v1) { double v[] = {v0, v1}; - val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2); + val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2); } double get0() const { - return vfmv_f_s_f64m1_f64(val, 2); + return vfmv_f_s_f64m1_f64(val); } vfloat64m1_t val; }; - +/* #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \ -inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \ +inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \ -inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \ inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \ inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \ @@ -233,17 +233,128 @@ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2( OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8) -OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8) +OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8) OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16) -OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16) 
+OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, i16) OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32) -OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32) +OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32) OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64) -OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64) +OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64) OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64) OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32) +*/ +inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const 
v_uint16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return 
v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); } +inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return 
v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); } +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); } + #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ -inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \ +inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \ inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) @@ -254,7 +365,7 @@ OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) -inline v_float32x4 
v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } +inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } @@ -401,10 +512,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -412,9 +523,9 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -471,11 +582,11 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ - return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \ } \ inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ { \ - a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \ + a.val = vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4)); \ return a; \ } @@ -485,17 +596,17 @@ OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) inline v_float32x4 operator ~ (const v_float32x4& a) { - return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4))); } #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ - return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 
2))); \ } \ inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ { \ - a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \ + a.val = vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2)); \ return a; \ } @@ -505,7 +616,7 @@ OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) inline v_float64x2 operator ~ (const v_float64x2& a) { - return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); + return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2))); } inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { @@ -527,19 +638,19 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) inline v_uint32x4 v_abs(v_int32x4 x) { vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); - return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); + return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4))); } inline v_uint16x8 v_abs(v_int16x8 x) { vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); - return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); + return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8))); } inline v_uint8x16 v_abs(v_int8x16 x) { vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); - return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); + return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16))); } inline v_float32x4 v_abs(v_float32x4 x) @@ -591,7 +702,7 @@ inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \ vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\ vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\ - return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \ + return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \ } OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16) @@ -604,8 +715,8 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, { vint16m2_t res = vundefined_i16m2(); res = vwmul_vv_i16m2(a.val, b.val, 16); - c.val = vget_i16m2_i16m1(res, 0); - d.val = vget_i16m2_i16m1(res, 1); + c.val = vget_v_i16m2_i16m1(res, 0); + d.val = vget_v_i16m2_i16m1(res, 1); } inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, @@ -613,8 +724,8 @@ inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, { vuint16m2_t res = vundefined_u16m2(); res = vwmulu_vv_u16m2(a.val, b.val, 16); - c.val = vget_u16m2_u16m1(res, 0); - d.val = vget_u16m2_u16m1(res, 1); + c.val = vget_v_u16m2_u16m1(res, 0); + d.val = vget_v_u16m2_u16m1(res, 1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -622,8 +733,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, { vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - c.val = vget_i32m2_i32m1(res, 0); - d.val = vget_i32m2_i32m1(res, 1); + c.val = vget_v_i32m2_i32m1(res, 0); + d.val = vget_v_i32m2_i32m1(res, 1); } inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, @@ -631,8 +742,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, { vuint32m2_t res = vundefined_u32m2(); res = vwmulu_vv_u32m2(a.val, b.val, 8); - c.val = vget_u32m2_u32m1(res, 0); - d.val = vget_u32m2_u32m1(res, 1); + c.val = vget_v_u32m2_u32m1(res, 0); + d.val = vget_v_u32m2_u32m1(res, 1); } inline void 
v_mul_expand(const v_int32x4& a, const v_int32x4& b, @@ -640,8 +751,8 @@ inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, { vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - c.val = vget_i64m2_i64m1(res, 0); - d.val = vget_i64m2_i64m1(res, 1); + c.val = vget_v_i64m2_i64m1(res, 0); + d.val = vget_v_i64m2_i64m1(res, 1); } inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, @@ -649,8 +760,8 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, { vuint64m2_t res = vundefined_u64m2(); res = vwmulu_vv_u64m2(a.val, b.val, 4); - c.val = vget_u64m2_u64m1(res, 0); - d.val = vget_u64m2_u64m1(res, 1); + c.val = vget_v_u64m2_u64m1(res, 0); + d.val = vget_v_u64m2_u64m1(res, 1); } OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) @@ -669,118 +780,202 @@ OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) // 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4)); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 
1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2)); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
+    vuint32m4_t vindex32 = vundefined_u32m4();
+    vuint32m1_t vindex0 = vid_v_u32m1(4);
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0);
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4));
+    vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4));
+    vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16);
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
+    v1 = vrgather_vv_i16m2(v1, vindex, 16);
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
 }

 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
 }

 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
+    v1 = vrgather_vv_u32m2(v1, vindex, 8);
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
 }

 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
 }

 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
 {
+    vuint64m4_t vindex64 = vundefined_u64m4();
+    vuint64m1_t vindex0 = vid_v_u64m1(2);
+    vindex0 = vsll_vx_u64m1(vindex0, 2, 2);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0);
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2));
+    vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2));
+    vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8);
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
+    v1 = vrgather_vv_i32m2(v1, vindex, 8);
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
 }

 //////// Fast Dot Product ////////

@@ -789,14 +984,14 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
     vint32m2_t v1 = vundefined_i32m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4));
 }

 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
     vint32m2_t v1 = vundefined_i32m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4));
 }

 // 32 >> 64
@@ -804,13 +999,13 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
     vint64m2_t v1 = vundefined_i64m2();
     v1 = vwmul_vv_i64m2(a.val, b.val, 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2));
 }
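Editor's note: the compound-literal index vectors such as `(vuint16m2_t){0, 4, 8, 12, ...}` appear to have been dropped by newer RVV toolchains, which is why every hunk above rebuilds the same permutation at runtime from `vid`/`vsll`/`vadd`/`vset`. A scalar sketch of the index that sequence produces (illustrative only, not part of the patch):

```cpp
#include <cstdio>

int main()
{
    // Part p (p = 0..3) of the m4 register holds lane_id * 4 + p, mirroring
    // vid_v -> vsll(2) -> vadd_vx(p) -> vset_v_u32m1_u32m4(..., p, ...).
    unsigned index[16];
    for (int part = 0; part < 4; part++)
        for (int lane = 0; lane < 4; lane++)
            index[part * 4 + lane] = lane * 4 + part;
    for (int i = 0; i < 16; i++)
        printf("%u ", index[i]);   // prints 0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
    printf("\n");
    return 0;
}
```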
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
     vint64m2_t v1 = vundefined_i64m2();
     v1 = vwmul_vv_i64m2(a.val, b.val, 8);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4));
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 4), c.val, 4));
 }

 // 8 >> 32
@@ -819,8 +1014,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4));
 }

 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
@@ -828,8 +1023,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
     vuint16m2_t v1 = vundefined_u16m2();
     vuint32m2_t v2 = vundefined_u32m2();
     v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
-    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
-    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
+    v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8);
+    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4));
 }

 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
@@ -837,16 +1032,16 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4));
 }

 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
     vint16m2_t v1 = vundefined_i16m2();
     vint32m2_t v2 = vundefined_i32m2();
     v1 = vwmul_vv_i16m2(a.val, b.val, 16);
-    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
-    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
+    v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8);
+    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4));
 }

 // 16 >> 64
@@ -855,16 +1050,16 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2));
 }

 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
 {
     vuint32m2_t v1 = vundefined_u32m2();
     vuint64m2_t v2 = vundefined_u64m2();
     v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
-    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
-    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
+    v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4);
+    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2));
 }

 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
@@ -872,16 +1067,16 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2));
 }

 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
 {
     vint32m2_t v1 = vundefined_i32m2();
     vint64m2_t v2 = vundefined_i64m2();
     v1 = vwmul_vv_i32m2(a.val, b.val, 8);
-    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
-    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
+    v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4);
+    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2));
 }

@@ -890,16 +1085,16 @@ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
 {\
     v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
     val = intrin(val, a.val, val, num); \
-    return vmv_x_s_##len##m1_##len(val, num); \
+    return vmv_x_s_##len##m1_##len(val); \
 }

-#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
+#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \
 inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
 {\
-    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
+    v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \
     val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
-    return val[0]; \
+    return scalerfunc(val); \
 }
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
@@ -910,30 +1105,30 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu
 inline float v_reduce_sum(const v_float32x4& a) \
 {\
     vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \
-    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \
-    return vfmv_f_s_f32m1_f32(val, 4); \
+    val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \
+    return vfmv_f_s_f32m1_f32(val); \
 }
 inline double v_reduce_sum(const v_float64x2& a) \
 {\
     vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \
-    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \
-    return vfmv_f_s_f64m1_f64(val, 2); \
+    val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \
+    return vfmv_f_s_f64m1_f64(val); \
 }
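Editor's note: `vfredsum` was respecified upstream, and the patch now uses `vfredosum`, the ordered reduction, which adds elements strictly left to right and so gives bit-exact results across implementations. A scalar illustration of why summation order matters for floats (plain C++, not part of the patch):

```cpp
#include <cstdio>

int main()
{
    float v[4] = {1e8f, 1.0f, -1e8f, 1.0f};
    // Ordered left-to-right, as vfredosum does:
    float ordered = ((v[0] + v[1]) + v[2]) + v[3];          // 1.0f
    // A pairwise tree order that an unordered reduction may legally use:
    float tree = (v[0] + v[1]) + (v[2] + v[3]);             // 0.0f
    printf("%g vs %g\n", ordered, tree);  // differs: float addition isn't associative
    return 0;
}
```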
 inline uint64 v_reduce_sum(const v_uint64x2& a)
-{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }
+{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); }
 inline int64 v_reduce_sum(const v_int64x2& a)
-{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }
+{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); }
 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
-OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \
+OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
 OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)

@@ -944,11 +1139,15 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
     vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
     vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
     vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
-    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
-    b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
-    c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
-    d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
-    return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
+    a0 = vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4);
+    b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4);
+    c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4);
+    d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4);
+    vfloat32m1_t res;
+    res = vslideup_vx_f32m1(a0, b0, 1, 4);
+    res = vslideup_vx_f32m1(res, c0, 2, 4);
+    res = vslideup_vx_f32m1(res, d0, 3, 4);
+    return v_float32x4(res);
 }

 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
@@ -957,8 +1156,8 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
     vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
     vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
     vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
-    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
-    return a0[0];
+    a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4);
+    return vfmv_f_s_f32m1_f32(a0);
 }
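Editor's note: `v_reduce_sum4` now assembles its result with `vslideup` instead of lane subscripts (`a0[0]` indexing of vector types is another dropped extension). Its semantics in scalar form (illustrative sketch, not part of the patch):

```cpp
// out = { sum(a), sum(b), sum(c), sum(d) }, each sum taken in order,
// matching the four vfredosum reductions followed by three vslideup inserts.
static inline void reduce_sum4_ref(const float a[4], const float b[4],
                                   const float c[4], const float d[4], float out[4])
{
    const float* rows[4] = {a, b, c, d};
    for (int r = 0; r < 4; r++)
    {
        float s = 0.f;
        for (int i = 0; i < 4; i++)
            s += rows[r][i];   // ordered accumulation, like vfredosum
        out[r] = s;
    }
}
```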
 #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
@@ -1020,43 +1219,43 @@ inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
 inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
 {
     vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
-}
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
+}/**/
 inline v_float32x4 v_not_nan(const v_float32x4& a)
 {
-    vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
+    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4);
     vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
-    return v_float32x4((vfloat32m1_t)res);
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(res));
 }
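Editor's note: `vmford` (the ordered compare) no longer exists in current RVV intrinsics, so `v_not_nan` now relies on the IEEE-754 identity that `x == x` is false exactly when `x` is NaN. A quick scalar demonstration (not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    float x = std::nanf("");
    // Self-equality fails only for NaN, so (a == a) is exactly "not NaN":
    printf("%d %d\n", x == x, 1.0f == 1.0f);   // prints: 0 1
    return 0;
}
```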
 //TODO: ==
@@ -1064,43 +1263,43 @@ inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
 {
     vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
-}
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
+}/**/
 inline v_float64x2 v_not_nan(const v_float64x2& a)
 {
-    vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
+    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2);
     vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
-    return v_float64x2((vfloat64m1_t)res);
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(res));
 }
 #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
 inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
@@ -1108,16 +1307,23 @@ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                            v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                            v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
 { \
+    vuint32m4_t vindex = vundefined_u32m4(); \
+    vuint32m1_t vindex0 = vid_v_u32m1(4); \
+    vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \
+    vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \
+    vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \
+    vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \
+    vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \
     v##_Tp##32m4_t val = vundefined_##_T##m4(); \
-    val = vset_##_T##m4(val, 0, a0.val); \
-    val = vset_##_T##m4(val, 1, a1.val); \
-    val = vset_##_T##m4(val, 2, a2.val); \
-    val = vset_##_T##m4(val, 3, a3.val); \
-    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \
-    b0.val = vget_##_T##m4_##_T##m1(val, 0); \
-    b1.val = vget_##_T##m4_##_T##m1(val, 1); \
-    b2.val = vget_##_T##m4_##_T##m1(val, 2); \
-    b3.val = vget_##_T##m4_##_T##m1(val, 3); \
+    val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \
+    val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \
+    val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \
+    val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \
+    val = vrgather_vv_##_T##m4(val, vindex, 16); \
+    b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \
+    b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \
+    b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \
+    b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \
 }
 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
 OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
@@ -1167,25 +1373,28 @@ template inline _Tpvec v_rotate_left(const _Tpvec& a) \
 } \
 template inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { \
-    return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
+    suffix##m1_t res = vundefined_##_T##m1(); \
+    return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
 { return a; } \
 template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { \
     suffix##m2_t tmp = vundefined_##_T##m2(); \
-    tmp = vset_##_T##m2(tmp, 0, a.val); \
-    tmp = vset_##_T##m2(tmp, 1, b.val); \
-    tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
-    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
+    suffix##m2_t res = vundefined_##_T##m2(); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \
+    res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\
+    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\
 } \
 template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 { \
     suffix##m2_t tmp = vundefined_##_T##m2(); \
-    tmp = vset_##_T##m2(tmp, 0, b.val); \
-    tmp = vset_##_T##m2(tmp, 1, a.val); \
-    tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
-    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
+    suffix##m2_t res = vundefined_##_T##m2(); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \
+    tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \
+    res = vslideup_vx_##_T##m2(res, tmp, n, num2);\
+    return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
 { \
@@ -1203,50 +1412,132 @@ OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
 OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)

-#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
+#if 1
+#define vreinterpret_v_i8m1_i8m1
+#define vreinterpret_v_u8m1_u8m1
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \
 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
 { \
-    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
-    vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
-    return _Tpvec(_Tp2##_t(tmp)); } \
+    _Tp2##_t res = vundefined_##len(); \
+    _Tp2##_t res1 = vundefined_##len(); \
+    res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \
+    res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \
+    res = vslideup_vx_##len(res, res1, hnum, num); \
+    return _Tpvec(res); } \
 inline _Tpvec v_load_low(const _Tp* ptr) \
-{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
-{ return _Tpvec(vle_v_##len(ptr, num)); } \
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
 inline _Tpvec v_load(const _Tp* ptr) \
-{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
+{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, hnum);}\
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
-    _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \
-    vse_v_##len(ptr, a0, hnum);}\
+    _Tp2##_t a0 = vundefined_##len(); \
+    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
+    vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
-{ vse_v_##len(ptr, a.val, num); } \
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \
 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
-{ vse_v_##len(ptr, a.val, num); }
+{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); }
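Editor's note: the rewritten load/store macro funnels everything through `vle8`/`vse8` on bytes and then `vreinterpret`s to the element type, which sidesteps element-alignment assumptions of the wider loads. A scalar model of that idea (plain C++, illustrative only; the helper name is made up):

```cpp
#include <cstdint>
#include <cstring>

// Byte-wise load has no alignment requirement; reinterpreting afterwards
// recovers the element view, mirroring vle8_v_u8m1 + vreinterpret_v_u8m1_*.
static inline void load_u32x4_ref(const void* ptr, uint32_t out[4])
{
    unsigned char bytes[16];
    memcpy(bytes, ptr, 16);   // like vle8_v_u8m1((uchar*)ptr, 16)
    memcpy(out, bytes, 16);   // like vreinterpret: same bits, new element type
}
```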
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
-OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar)
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp2##_t res = vundefined_##len(); \
+    _Tp2##_t res1 = vundefined_##len(); \
+    res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \
+    res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \
+    res = vslideup_vx_##len(res, res1, hnum, num); \
+    return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tp2##_t a0 = vundefined_##len(); \
+    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
+    vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); }
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
+
+#else
+
+#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp2##_t res, res1; \
+    res = vle##elemsize##_v_##len(ptr0, hnum); \
+    res1 = vle##elemsize##_v_##len(ptr1, hnum); \
+    res = vslideup_vx_##len(res, res1, hnum, num); \
+    return _Tpvec(res); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tp2##_t a0; \
+    a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \
+    vse##elemsize##_v_##len(ptr, a0, hnum);}\
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ vse##elemsize##_v_##len(ptr, a.val, num); }
+
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32)
+OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64)
+
+#endif

 ////////////// Lookup table access ////////////////////

 inline v_int8x16 v_lut(const schar* tab, const int* idx)
 {
-#if 1
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[ 0]],
@@ -1266,16 +1557,18 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
         tab[idx[14]],
         tab[idx[15]]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 #else
-    int32xm4_t index32 = vlev_int32xm4(idx, 16);
-    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
-    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
-    return v_int8x16(vlxbv_i8m1(tab, index, 16));
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16));
+#endif
 #endif
 }
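Editor's note: the `vloxei*` indexed loads used throughout the new LUT paths take BYTE offsets, which is why the wider variants shift the element indices by log2 of the element size before gathering. A scalar model of that addressing (illustrative, not part of the patch):

```cpp
#include <cstdint>

// Mirrors v_lut(const int*): byte offset = element index << 2, then gather,
// like vsll_vx_u32m1(idx, 2, 4) followed by vloxei32_v_i32m1.
static inline void lut_i32_ref(const int32_t* tab, const int32_t idx[4], int32_t out[4])
{
    for (int i = 0; i < 4; i++)
    {
        uint32_t byte_off = (uint32_t)idx[i] << 2;
        out[i] = *(const int32_t*)((const char*)tab + byte_off);
    }
}
```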
 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[0]],
@@ -1295,10 +1588,24 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
         tab[idx[7]],
         tab[idx[7] + 1]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8);
+    seq = vid_v_u32m4(16);
+    index = vsrl_vx_u32m4(seq, 1, 16);
+    vidx = vrgather_vv_u32m4(vidx, index, 16);
+    index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16);
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
 }
 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
 {
+#if 0
     schar CV_DECL_ALIGNED(32) elems[16] =
     {
         tab[idx[0]],
@@ -1318,7 +1625,23 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
         tab[idx[3] + 2],
         tab[idx[3] + 3]
     };
-    return v_int8x16(vle_v_i8m1(elems, 16));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
+#else
+    vuint32m4_t seq, index;
+    vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4);
+    seq = vid_v_u32m4(16);
+    index = vsrl_vx_u32m4(seq, 2, 16);
+    vidx = vrgather_vv_u32m4(vidx, index, 16);
+    seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0));
+    seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0));
+    seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0));
+    index = vadd_vv_u32m4(seq, vidx, 16);
+#if __riscv_v == 7000
+    return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16));
+#else
+    return v_int8x16(vloxei32_v_i8m1(tab, index, 16));
+#endif
+#endif
 }
 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
@@ -1327,6 +1650,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint

 inline v_int16x8 v_lut(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1338,10 +1662,18 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
         tab[idx[6]],
         tab[idx[7]]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8));
+#endif
+#endif
 }
 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1353,10 +1685,24 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
         tab[idx[3]],
         tab[idx[3] + 1]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4);
+    seq = vid_v_u32m2(8);
+    index = vsrl_vx_u32m2(seq, 1, 8);
+    vidx = vrgather_vv_u32m2(vidx, index, 8);
+    index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8);
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
 }
 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
 {
+#if 0
     short CV_DECL_ALIGNED(32) elems[8] =
     {
         tab[idx[0]],
@@ -1368,7 +1714,21 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
         tab[idx[1] + 2],
         tab[idx[1] + 3]
     };
-    return v_int16x8(vle_v_i16m1(elems, 8));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
+#else
+    vuint32m2_t seq, index;
+    vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2);
+    seq = vid_v_u32m2(8);
+    index = vsrl_vx_u32m2(seq, 2, 8);
+    vidx = vrgather_vv_u32m2(vidx, index, 8);
+    seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0));
+    index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8);
+#if __riscv_v == 7000
+    return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8));
+#else
+    return v_int16x8(vloxei32_v_i16m1(tab, index, 8));
+#endif
+#endif
 }
 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -1376,6 +1736,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein

 inline v_int32x4 v_lut(const int* tab, const int* idx)
 {
+#if 0
     int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1383,10 +1744,14 @@ inline v_int32x4 v_lut(const int* tab, const int* idx)
         tab[idx[2]],
         tab[idx[3]]
     };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
 }
 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 {
+#if 0
     int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1394,11 +1759,20 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
         tab[idx[1]],
         tab[idx[1] + 1]
     };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
+    seq = vid_v_u32m1(4);
+    index = vsrl_vx_u32m1(seq, 1, 4);
+    vidx = vrgather_vv_u32m1(vidx, index, 4);
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
+    return v_int32x4(vloxei32_v_i32m1(tab, index, 4));
+#endif
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
-    return v_int32x4(vle_v_i32m1(tab+idx[0], 4));
+    return v_int32x4(vle32_v_i32m1(tab+idx[0], 4));
 }
 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
@@ -1406,26 +1780,27 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re

 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
 {
-    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_int64x2(res);
+    //vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
 {
-    return v_int64x2(vle_v_i64m1(tab+idx[0], 2));
+    return v_int64x2(vle64_v_i64m1(tab+idx[0], 2));
 }
 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
 {
-    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_uint64x2(res);
+    //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
+    return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
 {
-    return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
+    return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2));
 }
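Editor's note: the pair-LUT index construction above is compact but non-obvious; in scalar terms the gathered index `i` becomes `idx[i / 2] + (i & 1)` (and the quad variants use `idx[i / 4] + (i % 4)`). A minimal sketch of the pairs case (plain C++, illustrative only):

```cpp
// Scalar model of v_lut_pairs: seq = vid, pair id = seq >> 1,
// lane offset = seq & 1, so element i reads tab[idx[i >> 1] + (i & 1)].
static inline void lut_pairs_i32_ref(const int* tab, const int idx[2], int out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = tab[idx[i >> 1] + (i & 1)];
}
```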
 inline v_float32x4 v_lut(const float* tab, const int* idx)
 {
+#if 0
     float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1433,10 +1808,14 @@ inline v_float32x4 v_lut(const float* tab, const int* idx)
         tab[idx[2]],
         tab[idx[3]]
     };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else
+    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4));
+#endif
 }
 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
 {
+#if 0
     float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idx[0]],
@@ -1444,69 +1823,79 @@ inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
         tab[idx[1]],
         tab[idx[1]+1]
     };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
+#else
+    vuint32m1_t seq, index;
+    vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2);
+    seq = vid_v_u32m1(4);
+    index = vsrl_vx_u32m1(seq, 1, 4);
+    vidx = vrgather_vv_u32m1(vidx, index, 4);
+    index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4);
+    return v_float32x4(vloxei32_v_f32m1(tab, index, 4));
+#endif
 }
 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
 {
-    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
+    return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
 }
 inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
-    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
-    return v_float64x2(res);
+    //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2));
 }
 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
 {
-    return v_float64x2(vle_v_f64m1(tab+idx[0], 2));
+    return v_float64x2(vle64_v_f64m1(tab+idx[0], 2));
 }

 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    /*int CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_int32x4(vle_v_i32m1(elems, 4));
+    };*/
+    return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
 {
-    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    /*unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_uint32x4(vle_v_u32m1(elems, 4));
+    };*/
    return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    /*float CV_DECL_ALIGNED(32) elems[4] =
     {
         tab[idxvec.val[0]],
         tab[idxvec.val[1]],
         tab[idxvec.val[2]],
         tab[idxvec.val[3]]
-    };
-    return v_float32x4(vle_v_f32m1(elems, 4));
+    };*/
    return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4));
 }
 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
 {
-    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
-    return v_float64x2(res);
+    //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
+    return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2));
 }
 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
 {
-    vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
-    vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
+    vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4);
+    //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);

-    x.val = vlxe_v_f32m1(tab, index_x, 4);
-    y.val = vlxe_v_f32m1(tab, index_y, 4);
+    //x.val = vlxe_v_f32m1(tab, index_x, 4);
+    //y.val = vlxe_v_f32m1(tab, index_y, 4);
+    vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4);
 }
 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
@@ -1518,52 +1907,52 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
 }

-#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
+#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \
 inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
-    tmp = vset_##_T2##m2(tmp, 0, a.val); \
-    tmp = vset_##_T2##m2(tmp, 1, b.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
     return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
 }\
 template<int n> inline \
 v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
-    tmp = vset_##_T2##m2(tmp, 0, a.val); \
-    tmp = vset_##_T2##m2(tmp, 1, b.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \
     return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
 }\
 inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
-    tmp = vset_##_T2##m2(tmp, 0, a.val); \
-    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
     asm("" ::: "memory"); \
-    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
+    vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
 }\
 template<int n> inline \
 void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
 { \
     v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
-    tmp = vset_##_T2##m2(tmp, 0, a.val); \
-    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
-    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
+    vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
 }
-OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
-OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
-OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
-OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16)
+OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32)
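Editor's note: `vnclip_vx` and friends became `vnclip_wx` (the `w` marks the wide source operand), and the stores now carry the element width in the name. Functionally the pack is still a shift-then-saturate narrowing; rounding mode aside, a scalar reference of the signed 16-to-8 case looks like this (illustrative, not part of the patch):

```cpp
#include <cstdint>

// Scalar reference for v_rshr_pack<n>(int16 -> int8): shift right by n,
// then clamp to the narrow type's range, as vnclip_wx does.
static inline int8_t sat_narrow_i16(int16_t v, int n)
{
    int32_t t = (int32_t)v >> n;   // arithmetic shift of the wide value
    if (t > 127)  t = 127;         // saturate high
    if (t < -128) t = -128;        // saturate low
    return (int8_t)t;
}
```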
 // pack boolean
 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
 {
     vuint16m2_t tmp = vundefined_u16m2(); \
-    tmp = vset_u16m2(tmp, 0, a.val); \
-    tmp = vset_u16m2(tmp, 1, b.val); \
-    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
+    tmp = vset_v_u16m1_u16m2(tmp, 0, a.val); \
+    tmp = vset_v_u16m1_u16m2(tmp, 1, b.val); \
+    return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16));
 }

 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
@@ -1571,12 +1960,12 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
 {
     vuint32m4_t vabcd = vundefined_u32m4(); \
     vuint16m2_t v16 = vundefined_u16m2(); \
-    vabcd = vset_u32m4(vabcd, 0, a.val); \
-    vabcd = vset_u32m4(vabcd, 1, b.val); \
-    vabcd = vset_u32m4(vabcd, 2, c.val); \
-    vabcd = vset_u32m4(vabcd, 3, d.val); \
-    v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
-    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
+    vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val); \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val); \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val); \
+    vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val); \
+    v16 = vnsrl_wx_u16m2(vabcd, 0, 16);
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
 }

 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
@@ -1586,17 +1975,17 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
     vuint64m8_t v64 = vundefined_u64m8(); \
     vuint32m4_t v32 = vundefined_u32m4(); \
     vuint16m2_t v16 = vundefined_u16m2(); \
-    v64 = vset_u64m8(v64, 0, a.val); \
-    v64 = vset_u64m8(v64, 1, b.val); \
-    v64 = vset_u64m8(v64, 2, c.val); \
-    v64 = vset_u64m8(v64, 3, d.val); \
-    v64 = vset_u64m8(v64, 4, e.val); \
-    v64 = vset_u64m8(v64, 5, f.val); \
-    v64 = vset_u64m8(v64, 6, g.val); \
-    v64 = vset_u64m8(v64, 7, h.val); \
-    v32 = vnsrl_vx_u32m4(v64, 0, 16);
-    v16 = vnsrl_vx_u16m2(v32, 0, 16);
-    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
+    v64 = vset_v_u64m1_u64m8(v64, 0, a.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 1, b.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 2, c.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 3, d.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 4, e.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 5, f.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 6, g.val); \
+    v64 = vset_v_u64m1_u64m8(v64, 7, h.val); \
+    v32 = vnsrl_wx_u32m4(v64, 0, 16);
+    v16 = vnsrl_wx_u16m2(v32, 0, 16);
+    return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16));
 }

 //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
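Editor's note: `v_pack_b` only renames the narrowing shifts (`vnsrl_vx` to `vnsrl_wx`) and the `vset` tuple inserts; the operation is unchanged. Its scalar semantics for the 16-bit case (illustrative sketch, not part of the patch):

```cpp
#include <cstdint>

// Scalar model of v_pack_b(u16x8, u16x8): concatenate a and b, then keep the
// low byte of every 16-bit lane (a narrowing shift right by 0).
static inline void pack_b_ref(const uint16_t a[8], const uint16_t b[8], uint8_t out[16])
{
    for (int i = 0; i < 8; i++)
    {
        out[i]     = (uint8_t)a[i];   // vnsrl_wx_u8m1(tmp, 0, 16), first half
        out[i + 8] = (uint8_t)b[i];   // second half
    }
}
```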
@@ -1612,35 +2001,35 @@
 inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
-    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \
 } \
 inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \
+    return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \
 } \
 template<int n> inline \
 v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
-    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \
     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \
+    return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \
 } \
 template<int n> inline \
 void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
 { \
     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
-    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
+    tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \
     vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
-    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \
-    return vse_v_u##tp1##m1(ptr, val, num2);\
+    vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \
+    return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\
 }
 OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
 OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
@@ -1690,8 +2079,12 @@ static const signed char popCountTable[256] =
 };

 inline vuint8m1_t vcnt_u8(vuint8m1_t val){
-    vuint8m1_t v0 = val & 1;
-    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0;
+#if __riscv_v == 7000
+    vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16);
+    return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16);
+#else
+    return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16);
+#endif
 }

 inline v_uint8x16
@@ -1703,156 +2096,138 @@ v_popcount(const v_uint8x16& a)
 inline v_uint8x16
 v_popcount(const v_int8x16& a)
 {
-    return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
+    return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val)));
 }

 inline v_uint16x8
 v_popcount(const v_uint16x8& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
-    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
-    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
 }

 inline v_uint16x8
 v_popcount(const v_int16x8& a)
 {
-    vuint8m2_t tmp = vundefined_u8m2();
-    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
-    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
-    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \
-    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
-    return v_uint16x8(vget_u16m2_u16m1(res, 0));
+    vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val)));
+    vuint8m1_t seq = vid_v_u8m1(8);
+    vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8);
+    return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0));
 }
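Editor's note: `vcnt_u8` is a table-driven byte popcount. The two branches above agree because of a simple identity; a scalar check (illustrative, not part of the patch):

```cpp
#include <cstdint>

// popCountTable[b] holds the number of set bits in byte b.
// The __riscv_v == 7000 path computes table[b >> 1] + (b & 1), which equals
// table[b] because popcount(b) == popcount(b >> 1) + (b & 1).
static inline uint8_t byte_popcount_ref(const signed char table[256], uint8_t b)
{
    return (uint8_t)(table[b >> 1] + (b & 1));
}
```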
1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint32x4 v_popcount(const v_int32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val)); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(); + vuint16m1_t res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } inline v_uint64x2 v_popcount(const 
v_int64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val))); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } #define SMASK 1, 2, 4, 8, 16, 32, 64, 128 inline int v_signmask(const v_uint8x16& a) { + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16); - vuint32m1_t res = vmv_v_x_u32m1(0, 4); - vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8); - res = vredsum_vs_u32m2_u32m1(res, t2, res, 8); - res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8); - return vmv_x_s_u32m1_u32(res, 8); + vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int8x16& a) { - vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16); - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8); - res = vredsum_vs_i32m2_i32m1(res, t2, res, 8); - res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8); - return vmv_x_s_i32m1_i32(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); + vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_uint16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - 
vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8); + vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4); + vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint64x2& a) { - vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2); - int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1); - return res; + vuint64m1_t res = vundefined_u64m1(); + vuint64m1_t id = vid_v_u64m1(2); + vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2); + vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2); + vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2); + res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2); + return vmv_x_s_u64m1_u64(res); } inline int v_signmask(const v_int64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } @@ -1860,12 +2235,14 @@ inline int v_signmask(const v_float64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } inline int v_signmask(const v_float32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + return v_signmask(v_reinterpret_as_u32(a)); + /* + vuint32m1_t res; + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res);*/ } inline int v_scan_forward(const v_int8x16& a) { @@ -1905,24 +2282,22 @@ int val = v_signmask(a); if(val==0) return 0; else return trailingZeros32(val); } -#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \ +#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \ inline bool v_check_all(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = 
vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - return (v1[0] | v1[1] | v1[2] | v1[3]) == 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \ } \ inline bool v_check_any(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \ - vuint32m1_t v1 = vuint32m1_t(v0); \ - return (v1[0] | v1[1] | v1[2] | v1[3]) != 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \ } -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64) inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } @@ -1950,92 +2325,93 @@ inline bool v_check_any(const v_int64x2& a) inline bool v_check_any(const v_float64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } -#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \ +#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ { \ - return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \ + return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \ } -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32) inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) { - return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); + return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4)); } inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) { - return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); + return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2)); } -#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \ +#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \ inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \ { \ - _T1##_t b = 
vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \ - b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \ + b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \ } \ inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } \ inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \ } \ inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \ { \ - _T2##_t val = vle##_v_##_Tp1(ptr, num2); \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { vuint16m2_t b = vundefined_u16m2(); vuint32m2_t c = vundefined_u32m2(); - vuint8m1_t val = vle_v_u8m1(ptr, 4); \ + vuint8m1_t val = vle8_v_u8m1(ptr, 4); \ b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \ - c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ - return v_uint32x4(vget_u32m2_u32m1(c, 0)); + c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ + return v_uint32x4(vget_v_u32m2_u32m1(c, 0)); } inline v_int32x4 v_load_expand_q(const schar* ptr) { vint16m2_t b = vundefined_i16m2(); vint32m2_t c = vundefined_i32m2(); - vint8m1_t val = vle_v_i8m1(ptr, 4); \ + vint8m1_t val = vle8_v_i8m1(ptr, 4); \ b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \ - c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ - return v_int32x4(vget_i32m2_i32m1(c, 0)); + c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ + 
return v_int32x4(vget_v_i32m2_i32m1(c, 0)); } -#define VITL_16 (vuint32m2_t){0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} -#define VITL_8 (vuint32m2_t){0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} -#define VITL_4 (vuint32m2_t){0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} -#define VITL_2 (vuint32m2_t){0, 0, 2, 0, 1, 0, 3, 0} +#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} +#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} +#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} +#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0} -#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \ +#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ { \ v##_Tp##m2_t tmp = vundefined_##_T##m2();\ - tmp = vset_##_T##m2(tmp, 0, a0.val); \ - tmp = vset_##_T##m2(tmp, 1, a1.val); \ - vuint32m2_t mask = VITL_##num; \ - tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \ - b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \ - b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \ + unsigned mdata[] = VITL_##num; \ + vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \ + tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \ + b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \ + b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \ } \ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ @@ -2044,58 +2420,59 @@ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ } \ inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ - v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t b1 = vundefined_##_T##m1(); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ + b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ return v_##_Tpvec(b1);\ } \ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ { \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ } -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, 
int16, i16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, vreinterpret_v_u32m2_u64m2) inline v_uint8x16 v_reverse(const v_uint8x16 &a) { - vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16)); + return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_int8x16 v_reverse(const v_int8x16 &a) { - vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16)); + return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_uint16x8 v_reverse(const v_uint16x8 &a) { - vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8)); + return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_int16x8 v_reverse(const v_int16x8 &a) { - vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8)); + return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_uint32x4 v_reverse(const v_uint32x4 &a) { - return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_int32x4 v_reverse(const v_int32x4 &a) { - return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_float32x4 v_reverse(const v_float32x4 &a) @@ -2103,17 +2480,17 @@ inline v_float32x4 v_reverse(const v_float32x4 &a) inline v_uint64x2 v_reverse(const v_uint64x2 &a) { - return v_uint64x2(a.val[1], a.val[0]); + return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_int64x2 v_reverse(const v_int64x2 &a) { - return v_int64x2(a.val[1], a.val[0]); + return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_float64x2 v_reverse(const v_float64x2 &a) { - return v_float64x2(a.val[1], a.val[0]); + return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \ @@ -2132,19 +2509,19 @@ OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2) 
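Note on the v_reverse hunks above: the old implementations baked each reversal permutation into 64-bit vector literals and relied on casts between vector types, neither of which newer RVV toolchains accept. The rewrite computes the gather indices at run time instead: vid_v enumerates the lane numbers 0..n-1, vrsub_vx maps lane i to (n-1)-i, and vrgather_vv pulls src[(n-1)-i] into lane i. Below is a scalar model of that three-step idiom, for illustration only; the patch itself of course stays in RVV intrinsics.

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

// Scalar model of the rewritten v_reverse: vid_v enumerates lanes,
// vrsub_vx turns lane i into (n-1)-i, and vrgather_vv gathers through
// the resulting index vector.
template <typename T, std::size_t N>
std::array<T, N> model_v_reverse(const std::array<T, N>& src)
{
    std::array<std::size_t, N> idx{};
    std::array<T, N> dst{};
    for (std::size_t i = 0; i < N; ++i) idx[i] = i;                 // vid_v
    for (std::size_t i = 0; i < N; ++i) idx[i] = (N - 1) - idx[i];  // vrsub_vx
    for (std::size_t i = 0; i < N; ++i) dst[i] = src[idx[i]];       // vrgather_vv
    return dst;
}

int main()
{
    std::array<unsigned char, 16> v{};
    for (std::size_t i = 0; i < 16; ++i) v[i] = (unsigned char)i;
    std::array<unsigned char, 16> r = model_v_reverse(v);
    for (unsigned char b : r) std::printf("%d ", (int)b);  // 15 14 ... 0
    std::printf("\n");
    return 0;
}
```

The v_signmask rewrites earlier in this file take the same approach but shift instead of subtracting: vsll_vv turns the vid sequence into per-lane bit weights 1<<i, and a masked reduction sums exactly the weights whose sign bit is set.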
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3) -#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \ -template inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; } +#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \ +template inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); } -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, i8m1, 16, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64) #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \ template inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); } @@ -2158,10 +2535,24 @@ OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) + +inline void __builtin_riscv_fsrm(int val) +{ + asm("csrw frm, %0\n\t" + : + :"r"(val)); + return; +} + +inline void barrier1(void *arg) { + __asm__ __volatile__("" : : "r" (arg) : "memory"); +} + inline v_int32x4 v_round(const v_float32x4& a) { __builtin_riscv_fsrm(0); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2170,7 +2561,8 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_floor(const v_float32x4& a) { __builtin_riscv_fsrm(2); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2180,7 +2572,8 @@ inline v_int32x4 v_floor(const v_float32x4& a) inline v_int32x4 
v_ceil(const v_float32x4& a) { __builtin_riscv_fsrm(3); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2190,7 +2583,8 @@ inline v_int32x4 v_ceil(const v_float32x4& a) inline v_int32x4 v_trunc(const v_float32x4& a) { __builtin_riscv_fsrm(1); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2201,10 +2595,11 @@ inline v_int32x4 v_round(const v_float64x2& a) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); //_val = vset_f64m2(_val, 1, a.val); - _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); - vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2212,9 +2607,10 @@ inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2222,10 +2618,10 @@ inline v_int32x4 v_floor(const v_float64x2& a) { __builtin_riscv_fsrm(2); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2236,10 +2632,10 @@ inline v_int32x4 v_ceil(const v_float64x2& a) { __builtin_riscv_fsrm(3); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2250,139 +2646,86 @@ inline v_int32x4 v_trunc(const v_float64x2& a) { __builtin_riscv_fsrm(1); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = 
vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } -#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ { \ - v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ + intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ { \ - v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ + intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num); \ }\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ { \ - v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ + intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \ } \ -#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - intrin##2e_v_##_T##m1x2(ptr, ret, num); \ + intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - intrin##3e_v_##_T##m1x3(ptr, ret, num); \ + intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - intrin##4e_v_##_T##m1x4(ptr, ret, num); \ + intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \ } -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \ -OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \ -OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T) +#define 
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize) //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, ) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32, 32) -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ -{ \ - v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ -} \ +{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ -{ \ - v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ -}\ +{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ -{ \ - v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ -} \ +{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - vsseg2e_v_##_T##m1x2(ptr, ret, num); \ -} \ +{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - vsseg3e_v_##_T##m1x3(ptr, ret, num); \ -} \ +{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const 
v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ -{ \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - vsseg4e_v_##_T##m1x4(ptr, ret, num); \ -} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64) +{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64) + +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { @@ -2393,17 +2736,17 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); return v_float32x4(aval); } inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4); return v_float32x4(aval); } @@ -2411,26 +2754,26 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_int64x2& a) @@ -2441,8 +2784,9 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a) #endif inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; - return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { @@ -2451,8 +2795,9 @@ inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) inline v_int8x16 v_interleave_quads(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; - return 
v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { @@ -2461,35 +2806,40 @@ inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } inline v_int16x8 v_interleave_quads(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) { - vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; - return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_int8x16 v_pack_triplets(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16))); } inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } inline v_int16x8 v_pack_triplets(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), 
vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } @@ -2506,7 +2856,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); - vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2); + vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2); return v_float64x2(res); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) @@ -2514,21 +2864,37 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, return res + c; } #endif ////// FP16 support /////// +#if __riscv_v == 7000 inline v_float32x4 v_load_expand(const float16_t* ptr) { - vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4); + vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4); vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4); - return v_float32x4(vget_f32m2_f32m1(v32, 0)); + return v_float32x4(vget_v_f32m2_f32m1(v32, 0)); } inline void v_pack_store(float16_t* ptr, const v_float32x4& v) { vfloat32m2_t v32 = vundefined_f32m2(); - v32 = vset_f32m2(v32, 0, v.val); - vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4); - vse_v_f16m1((__fp16*)ptr, hv, 4); + v32 = vset_v_f32m1_f32m2(v32, 0, v.val); + vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4); + vse16_v_f16m1((__fp16*)ptr, hv, 4); +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4); + vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4); + return v_float32x4(v32); } +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + //vfloat32m2_t v32 = vundefined_f32m2(); + //v32 = vset_f32m2(v32, 0, v.val); + vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4); + vse16_v_f16mf2((__fp16*)ptr, hv, 4); +} +#endif inline void v_cleanup() {} @@ -2536,5 +2902,5 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond -} // namespace cv +} #endif diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index a33c3d295e..4cec7c0087 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -53,6 +53,7 @@ #include "opencv2/core/bufferpool.hpp" +#include #include namespace cv diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index 8d5c2e49f8..5fcc821da4 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -386,10 +386,8 @@ public: static Vec randn(_Tp a, _Tp b); static Vec randu(_Tp a, _Tp b); static Vec zeros(); -#ifdef CV_CXX11 static Vec diag(_Tp alpha) = delete; static Vec eye() = delete; -#endif //! 
per-element multiplication Vec mul(const Vec<_Tp, cn>& v) const; @@ -412,9 +410,7 @@ public: const _Tp& operator ()(int i) const; _Tp& operator ()(int i); -#ifdef CV_CXX11 Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default; -#endif Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp); Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp); diff --git a/modules/core/include/opencv2/core/quaternion.inl.hpp b/modules/core/include/opencv2/core/quaternion.inl.hpp index b901ecbc68..4204806a82 100644 --- a/modules/core/include/opencv2/core/quaternion.inl.hpp +++ b/modules/core/include/opencv2/core/quaternion.inl.hpp @@ -28,7 +28,7 @@ #define OPENCV_CORE_QUATERNION_INL_HPP #ifndef OPENCV_CORE_QUATERNION_HPP -#erorr This is not a standalone header. Include quaternion.hpp instead. +#error This is not a standalone header. Include quaternion.hpp instead. #endif //@cond IGNORE diff --git a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp index eb5ecde16b..bbc6cf8979 100644 --- a/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp +++ b/modules/core/include/opencv2/core/utils/allocator_stats.impl.hpp @@ -9,8 +9,6 @@ //#define OPENCV_DISABLE_ALLOCATOR_STATS -#ifdef CV_CXX11 - #include #ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE @@ -26,14 +24,6 @@ #define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long #endif -#else // CV_CXX11 - -#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE -#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int // CV_XADD supports int only -#endif - -#endif // CV_CXX11 - namespace cv { namespace utils { #ifdef CV__ALLOCATOR_STATS_LOG @@ -59,7 +49,7 @@ public: void onAllocate(size_t /*sz*/) {} void onFree(size_t /*sz*/) {} -#elif defined(CV_CXX11) +#else protected: typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; @@ -104,49 +94,7 @@ public: #endif curr -= (counter_t)sz; } - -#else // non C++11 - -protected: - typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; - volatile counter_t curr, total, total_allocs, peak; // overflow is possible, CV_XADD operates with 'int' only -public: - AllocatorStatistics() - : curr(0), total(0), total_allocs(0), peak(0) - {} - ~AllocatorStatistics() CV_OVERRIDE {} - - uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; } - uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; } - uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; } - uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; } - - void resetPeakUsage() CV_OVERRIDE { peak = curr; } - - // Controller interface - void onAllocate(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - - counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz; - - peak = std::max((counter_t)peak, new_curr); // non-thread safe - - //CV_XADD(&total, (uint64_t)sz); // overflow with int, non-reliable... 
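Review note on this allocator_stats hunk: the branch being removed emulated atomics with CV_XADD on volatile counters and, as its own comment concedes, updated peak in a non-thread-safe way. The retained C++11 branch (largely elided from this hunk) keeps the same counters on top of std::atomic. The sketch below models that pattern; the member names mirror the removed code, but the exact OpenCV implementation may differ in detail.

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

// Minimal sketch of the retained C++11 counter pattern (illustrative names).
class AllocatorStatisticsSketch
{
    typedef long long counter_t;  // stands in for OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
    std::atomic<counter_t> curr{0}, total{0}, total_allocs{0}, peak{0};
public:
    void onAllocate(std::size_t sz)
    {
        counter_t new_curr =
            curr.fetch_add((counter_t)sz, std::memory_order_relaxed) + (counter_t)sz;
        // Monotonic max via a CAS loop: thread-safe, unlike the removed
        // 'peak = std::max((counter_t)peak, new_curr)'.
        counter_t prev = peak.load(std::memory_order_relaxed);
        while (prev < new_curr &&
               !peak.compare_exchange_weak(prev, new_curr, std::memory_order_relaxed))
        {}
        total.fetch_add((counter_t)sz, std::memory_order_relaxed);
        total_allocs.fetch_add(1, std::memory_order_relaxed);
    }
    void onFree(std::size_t sz)
    {
        curr.fetch_sub((counter_t)sz, std::memory_order_relaxed);
    }
    uint64_t getCurrentUsage() const { return (uint64_t)curr.load(std::memory_order_relaxed); }
    uint64_t getPeakUsage()    const { return (uint64_t)peak.load(std::memory_order_relaxed); }
};
```

The CAS loop is the piece the old fallback could not express with CV_XADD alone: it raises peak monotonically without losing a concurrent update.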
- total += sz; - - CV_XADD(&total_allocs, (counter_t)1); - } - void onFree(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - CV_XADD(&curr, (counter_t)-sz); - } -#endif +#endif // OPENCV_DISABLE_ALLOCATOR_STATS }; #ifdef CV__ALLOCATOR_STATS_LOG diff --git a/modules/core/src/async.cpp b/modules/core/src/async.cpp index 78c0a1ee81..3aeaaf7394 100644 --- a/modules/core/src/async.cpp +++ b/modules/core/src/async.cpp @@ -3,7 +3,6 @@ // of this distribution and at http://opencv.org/license.html. #include "precomp.hpp" -//#undef CV_CXX11 // debug non C++11 mode #include "opencv2/core/async.hpp" #include "opencv2/core/detail/async_promise.hpp" @@ -16,11 +15,9 @@ #ifndef OPENCV_DISABLE_THREAD_SUPPORT -#ifdef CV_CXX11 #include #include #include -#endif namespace cv { @@ -37,12 +34,8 @@ struct AsyncArray::Impl void releasePromise() CV_NOEXCEPT { CV_XADD(&refcount_promise, -1); if(1 == CV_XADD(&refcount, -1)) delete this; } \ int refcount_promise; -#ifdef CV_CXX11 mutable std::mutex mtx; mutable std::condition_variable cond_var; -#else - mutable cv::Mutex mtx; -#endif mutable bool has_result; // Mat, UMat or exception @@ -88,11 +81,7 @@ struct AsyncArray::Impl if (!wait_for(timeoutNs)) return false; } -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif if (has_result) { if (!result_mat.empty()) @@ -145,7 +134,6 @@ struct AsyncArray::Impl if (timeoutNs == 0) return has_result; CV_LOG_INFO(NULL, "Waiting for async result ..."); -#ifdef CV_CXX11 std::unique_lock lock(mtx); const auto cond_pred = [&]{ return has_result == true; }; if (timeoutNs > 0) @@ -156,9 +144,6 @@ struct AsyncArray::Impl CV_Assert(has_result); return true; } -#else - CV_Error(Error::StsNotImplemented, "OpenCV has been built without async waiting support (C++11 is required)"); -#endif } AsyncArray getArrayResult() @@ -175,11 +160,7 @@ struct AsyncArray::Impl { if (future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); int k = value.kind(); if (k == _InputArray::UMAT) @@ -193,9 +174,7 @@ struct AsyncArray::Impl value.copyTo(*result_mat.get()); } has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } #if CV__EXCEPTION_PTR @@ -203,18 +182,12 @@ struct AsyncArray::Impl { if (future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); has_exception = true; exception = e; has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } #endif @@ -222,18 +195,12 @@ struct AsyncArray::Impl { if (future_is_returned && refcount_future == 0) CV_Error(Error::StsError, "Associated AsyncArray has been destroyed"); -#ifdef CV_CXX11 std::unique_lock lock(mtx); -#else - cv::AutoLock lock(mtx); -#endif CV_Assert(!has_result); has_exception = true; cv_exception = e; has_result = true; -#ifdef CV_CXX11 cond_var.notify_all(); -#endif } }; diff --git a/modules/core/src/matrix_wrap.cpp b/modules/core/src/matrix_wrap.cpp index 5eb5d36a75..8db888c26d 100644 --- a/modules/core/src/matrix_wrap.cpp +++ b/modules/core/src/matrix_wrap.cpp @@ -1952,12 +1952,7 @@ void _OutputArray::move(UMat& u) const int k = kind(); if (k == UMAT) { -#ifdef CV_CXX11 *(UMat*)obj = 
std::move(u); -#else - *(UMat*)obj = u; - u.release(); -#endif } else if (k == MAT) { @@ -1992,12 +1987,7 @@ void _OutputArray::move(Mat& m) const } else if (k == MAT) { -#ifdef CV_CXX11 *(Mat*)obj = std::move(m); -#else - *(Mat*)obj = m; - m.release(); -#endif } else if (k == MATX) { diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp index 395c7e5bd5..34030c9838 100644 --- a/modules/core/src/parallel.cpp +++ b/modules/core/src/parallel.cpp @@ -912,8 +912,7 @@ int getNumberOfCPUs_() * the minimum most value as it has high probablity of being right and safe. * Return 1 if we get 0 or not found on all methods. */ -#if defined CV_CXX11 \ - && !defined(__MINGW32__) /* not implemented (2020-03) */ \ +#if !defined(__MINGW32__) /* not implemented (2020-03) */ /* * Check for this standard C++11 way, we do not return directly because diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index b4fb466bb0..5a586b6666 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -120,11 +120,15 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); } #include // std::abort #endif -#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ +#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ || defined __QNX__ # include # include #if defined __QNX__ # include +# include +using Elf64_auxv_t = auxv64_t; +# include +const uint64_t AT_HWCAP = NT_GNU_HWCAP; #else # include #endif @@ -251,7 +255,7 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix) #include "omp.h" #endif -#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__ +#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __GLIBC__ || defined __HAIKU__ #include #include #include @@ -301,9 +305,7 @@ DECLARE_CV_CPUID_X86 #endif #endif -#if defined CV_CXX11 - #include -#endif +#include namespace cv { @@ -562,7 +564,7 @@ struct HWFeatures } #endif // CV_CPUID_X86 - #if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __QNX__ + #if defined __ANDROID__ || defined __linux__ || defined __QNX__ #ifdef __aarch64__ have[CV_CPU_NEON] = true; have[CV_CPU_FP16] = true; @@ -581,10 +583,12 @@ struct HWFeatures have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP } +#if defined(AT_HWCAP2) else if (auxv.a_type == AT_HWCAP2) { have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16 } +#endif } close(cpufile); @@ -611,7 +615,7 @@ struct HWFeatures CV_LOG_INFO(NULL, "- FP16 instructions is NOT enabled via build flags"); #endif #endif - #elif defined __arm__ && !defined __FreeBSD__ + #elif defined __arm__ int cpufile = open("/proc/self/auxv", O_RDONLY); if (cpufile >= 0) @@ -903,50 +907,15 @@ bool useOptimized(void) int64 getTickCount(void) { -#if defined CV_CXX11 std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); return (int64)now.time_since_epoch().count(); -#elif defined _WIN32 || defined WINCE - LARGE_INTEGER counter; - QueryPerformanceCounter( &counter ); - return (int64)counter.QuadPart; -#elif defined __MACH__ && defined __APPLE__ - return (int64)mach_absolute_time(); -#elif defined __unix__ - struct timespec tp; 
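Review note on this system.cpp hunk: all four platform-specific timer fallbacks (QueryPerformanceCounter, mach_absolute_time, clock_gettime, gettimeofday) are deleted, leaving the std::chrono::steady_clock path as the sole implementation of getTickCount() and getTickFrequency(). A standalone sketch of how the surviving pair composes into elapsed seconds follows; the main() harness is illustrative only, not part of the patch.

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

// Same scheme as the surviving implementation: ticks are steady_clock's
// native duration counts, and the frequency is the reciprocal of the
// clock period (ticks per second).
static int64_t tickCount()
{
    return (int64_t)std::chrono::steady_clock::now().time_since_epoch().count();
}

static double tickFrequency()
{
    using period = std::chrono::steady_clock::duration::period;  // e.g. std::nano
    return (double)period::den / (double)period::num;
}

int main()
{
    int64_t t0 = tickCount();
    volatile double sink = 0;
    for (int i = 0; i < 1000000; ++i) sink = sink + i;  // work being timed
    double secs = (double)(tickCount() - t0) / tickFrequency();
    std::printf("elapsed: %.6f s\n", secs);
    return 0;
}
```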
-        clock_gettime(CLOCK_MONOTONIC, &tp);
-        return (int64)tp.tv_sec*1000000000 + tp.tv_nsec;
-#else
-        struct timeval tv;
-        gettimeofday(&tv, NULL);
-        return (int64)tv.tv_sec*1000000 + tv.tv_usec;
-#endif
 }

 double getTickFrequency(void)
 {
-#if defined CV_CXX11
     using clock_period_t = std::chrono::steady_clock::duration::period;
     double clock_freq = clock_period_t::den / clock_period_t::num;
     return clock_freq;
-#elif defined _WIN32 || defined WINCE
-    LARGE_INTEGER freq;
-    QueryPerformanceFrequency(&freq);
-    return (double)freq.QuadPart;
-#elif defined __MACH__ && defined __APPLE__
-    static double freq = 0;
-    if( freq == 0 )
-    {
-        mach_timebase_info_data_t sTimebaseInfo;
-        mach_timebase_info(&sTimebaseInfo);
-        freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer;
-    }
-    return freq;
-#elif defined __unix__
-    return 1e9;
-#else
-    return 1e6;
-#endif
 }

 #if defined __GNUC__ && (defined __i386__ || defined __x86_64__ || defined __ppc__)
diff --git a/modules/core/test/test_async.cpp b/modules/core/test/test_async.cpp
index 58bcfddcd7..2fcee300cf 100644
--- a/modules/core/test/test_async.cpp
+++ b/modules/core/test/test_async.cpp
@@ -7,7 +7,7 @@
 #include

-#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
+#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)
 #include <thread>
 #include <chrono>
 #endif
@@ -85,7 +85,7 @@ TEST(Core_Async, LikePythonTest)
 }

-#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
+#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)

 TEST(Core_Async, AsyncThread_Simple)
 {
diff --git a/modules/core/test/test_misc.cpp b/modules/core/test/test_misc.cpp
index fef8cb839f..827f9185db 100644
--- a/modules/core/test/test_misc.cpp
+++ b/modules/core/test/test_misc.cpp
@@ -8,10 +8,8 @@
 #include

-#ifdef CV_CXX11
 #include <chrono>
 #include <thread>
-#endif

 namespace opencv_test { namespace {
@@ -282,9 +280,7 @@ public:
             // FP state is not supported
             // no checks
         }
-#ifdef CV_CXX11
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
-#endif
     }

     cv::details::FPDenormalsModeState base_state;
diff --git a/modules/core/test/test_precomp.hpp b/modules/core/test/test_precomp.hpp
index 8d9a931db4..4e109b3d75 100644
--- a/modules/core/test/test_precomp.hpp
+++ b/modules/core/test/test_precomp.hpp
@@ -4,6 +4,8 @@
 #ifndef __OPENCV_TEST_PRECOMP_HPP__
 #define __OPENCV_TEST_PRECOMP_HPP__

+#include
+
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/ocl_test.hpp"
 #include "opencv2/core/private.hpp"
diff --git a/modules/core/test/test_utils_tls.impl.hpp b/modules/core/test/test_utils_tls.impl.hpp
index 36b8805422..20facabadd 100644
--- a/modules/core/test/test_utils_tls.impl.hpp
+++ b/modules/core/test/test_utils_tls.impl.hpp
@@ -4,9 +4,7 @@
 // This is .hpp file included from test_utils.cpp

-#ifdef CV_CXX11
 #include <thread>  // std::thread
-#endif

 #include "opencv2/core/utils/tls.hpp"
@@ -34,8 +32,6 @@ public:
 int TLSReporter::g_last_id = 0;
 int TLSReporter::g_allocated = 0;

-#ifdef CV_CXX11
-
 template <typename T>
 static void callNThreadsWithTLS(int N, TLSData<T>& tls)
 {
@@ -129,6 +125,4 @@ static void testTLSAccumulator(bool detachFirst)
 TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); }
 TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); }

-#endif
-
 }}  // namespace
diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 95982f6fb0..1f495d33bb 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -1183,6 +1183,11 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<AttentionLayer> create(const LayerParams &params);
     };

+    class CV_EXPORTS GroupNormLayer : public Layer {
+    public:
+        static Ptr<GroupNormLayer> create(const LayerParams &params);
+    };
+
     //! @}
     //! @}
 CV__DNN_INLINE_NS_END
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index cf034db6f0..e239e4342c 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -444,7 +444,7 @@ CV__DNN_INLINE_NS_BEGIN
          *  Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
          *  backend.
          */
-        CV_WRAP static Net readFromModelOptimizer(const String& xml, const String& bin);
+        CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin);

         /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR).
          *  @param[in] bufferModelConfig buffer with model's configuration.
@@ -477,7 +477,7 @@ CV__DNN_INLINE_NS_BEGIN
          *  @param path   path to output file with .dot extension
          *  @see dump()
          */
-        CV_WRAP void dumpToFile(const String& path);
+        CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path);
         /** @brief Adds new layer to the net.
          *  @param name   unique name of the adding layer.
          *  @param type   typename of the adding layer (type must be registered in LayerRegister).
@@ -839,7 +839,7 @@ CV__DNN_INLINE_NS_BEGIN
     *  @param darknetModel path to the .weights file with learned network.
     *  @returns Network object that ready to do forward, throw an exception in failure cases.
     */
-    CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String());
+    CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String());

     /** @brief Reads a network model stored in Darknet model files.
     *  @param bufferCfg   A buffer contains a content of .cfg file with text description of the network architecture.
@@ -864,7 +864,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  @param caffeModel path to the .caffemodel file with learned network.
      *  @returns Net object.
      */
-    CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String());
+    CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String());

     /** @brief Reads a network model stored in Caffe model in memory.
      *  @param bufferProto buffer containing the content of the .prototxt file
@@ -893,7 +893,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  let us make it more flexible.
      *  @returns Net object.
      */
-    CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String());
+    CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String());

     /** @brief Reads a network model stored in TensorFlow framework's format.
      *  @param bufferModel buffer containing the content of the pb file
@@ -918,7 +918,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  @param model  path to the .tflite file with binary flatbuffers description of the network architecture
      *  @returns Net object.
      */
-    CV_EXPORTS_W Net readNetFromTFLite(const String &model);
+    CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model);

     /** @brief Reads a network model stored in TFLite framework's format.
      *  @param bufferModel buffer containing the content of the tflite file
@@ -957,7 +957,7 @@ CV__DNN_INLINE_NS_BEGIN
      *  or @ref readNetFromDarknet. An order of @p model and @p config
      *  arguments does not matter.
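The hunks in this file tag every path-taking parameter with CV_WRAP_FILE_PATH so that binding generators can treat those arguments as filesystem paths; the C++ signatures are otherwise unchanged. A minimal usage sketch of the annotated entry point (the model file name is a hypothetical placeholder):

```cpp
#include <opencv2/dnn.hpp>

int main() {
    // readNet() infers the framework from the file extension; an ONNX
    // model needs no separate config argument.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
    return net.empty() ? 1 : 0;
}
```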
*/ - CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = ""); + CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = ""); /** * @brief Read deep learning network represented in one of the supported formats. @@ -979,7 +979,7 @@ CV__DNN_INLINE_NS_BEGIN * backend. */ CV_EXPORTS_W - Net readNetFromModelOptimizer(const String &xml, const String &bin = ""); + Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = ""); /** @brief Load a network from Intel's Model Optimizer intermediate representation. * @param[in] bufferModelConfig Buffer contains XML configuration with network's topology. @@ -1008,7 +1008,7 @@ CV__DNN_INLINE_NS_BEGIN * @param onnxFile path to the .onnx file with text description of the network architecture. * @returns Network object that ready to do forward, throw an exception in failure cases. */ - CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile); + CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile); /** @brief Reads a network model from ONNX * in-memory buffer. @@ -1031,7 +1031,7 @@ CV__DNN_INLINE_NS_BEGIN * @param path to the .pb file with input tensor. * @returns Mat. */ - CV_EXPORTS_W Mat readTensorFromONNX(const String& path); + CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path); /** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center, * subtract @p mean values, scales values by @p scalefactor, swap Blue and Red channels. @@ -1204,7 +1204,7 @@ CV__DNN_INLINE_NS_BEGIN * is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe. * So the resulting model may be used there. */ - CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst, + CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst, const std::vector& layersTypes = std::vector()); /** @brief Create a text representation for a binary network stored in protocol buffer format. @@ -1213,7 +1213,7 @@ CV__DNN_INLINE_NS_BEGIN * * @note To reduce output file size, trained weights are not included. */ - CV_EXPORTS_W void writeTextGraph(const String& model, const String& output); + CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output); /** @brief Performs non maximum suppression given boxes and corresponding scores. @@ -1318,7 +1318,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP Model(const String& model, const String& config = ""); + CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1423,7 +1423,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP ClassificationModel(const String& model, const String& config = ""); + CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1473,7 +1473,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. 
*/ - CV_WRAP KeypointsModel(const String& model, const String& config = ""); + CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1505,7 +1505,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP SegmentationModel(const String& model, const String& config = ""); + CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1536,7 +1536,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP DetectionModel(const String& model, const String& config = ""); + CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1602,7 +1602,7 @@ public: * @param[in] config Text file contains network configuration */ CV_WRAP inline - TextRecognitionModel(const std::string& model, const std::string& config = "") + TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextRecognitionModel(readNet(model, config)) { /* nothing */ } /** @@ -1757,7 +1757,7 @@ public: * @param[in] config Text file contains network configuration. */ CV_WRAP inline - TextDetectionModel_EAST(const std::string& model, const std::string& config = "") + TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ } /** @@ -1818,7 +1818,7 @@ public: * @param[in] config Text file contains network configuration. 
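These (model, config) constructors all funnel through readNet(), so the two-argument form behaves identically across the Model subclasses. A short sketch of the detection variant; the file names and thresholds are placeholders, not values from this patch:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main() {
    cv::dnn::DetectionModel model("yolo.weights", "yolo.cfg");  // placeholder files
    model.setInputParams(1 / 255.0, cv::Size(416, 416), cv::Scalar(), /*swapRB=*/true);

    cv::Mat frame = cv::imread("frame.jpg");
    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;
    model.detect(frame, classIds, confidences, boxes,
                 /*confThreshold=*/0.5f, /*nmsThreshold=*/0.4f);
    return 0;
}
```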
*/ CV_WRAP inline - TextDetectionModel_DB(const std::string& model, const std::string& config = "") + TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_DB(readNet(model, config)) { /* nothing */ } CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold); diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index 66b5ad62c2..27fe7d1504 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -258,175 +258,163 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16) test_slice<4>(inputShape, begin, end); } -struct Layer_Scatter : public TestBaseWithParam > -{ - void test_layer(const std::vector& shape, const String reduction = "none", int axis = 0) +using Layer_Scatter = TestBaseWithParam, std::string, int, tuple>>; +PERF_TEST_P_(Layer_Scatter, scatter) { + std::vector shape = get<0>(GetParam()); + std::string reduction = get<1>(GetParam()); + int axis = get<2>(GetParam()); + int backend_id = get<0>(get<3>(GetParam())); + int target_id = get<1>(get<3>(GetParam())); + + Mat data(shape, CV_32FC1); + Mat indices(shape, CV_32FC1); + Mat updates(shape, CV_32FC1); + + randn(data, 0.f, 1.f); + randu(indices, 0, shape[axis]); + randn(updates, 0.f, 1.f); + + indices.convertTo(indices, CV_32SC1, 1, -1); + + Net net; + LayerParams lp; + lp.type = "Scatter"; + lp.name = "testLayer"; + lp.set("reduction", reduction); + lp.set("axis", axis); + + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup { - int backendId = get<0>(GetParam()); - int targetId = get<1>(GetParam()); + std::vector input_names{"data", "indices", "updates"}; + net.setInputsNames(input_names); + net.setInput(data, input_names[0]); + net.setInput(indices, input_names[1]); + net.setInput(updates, input_names[2]); - Mat data(shape, CV_32FC1); - Mat indices(shape, CV_32FC1); - Mat updates(shape, CV_32FC1); - - Scalar mean = 0.f; - Scalar std = 1.f; - randn(data, mean, std); - randu(indices, 0, shape[axis]); - randn(updates, mean, std); - - indices.convertTo(indices, CV_32SC1, 1, -1); - - Net net; - LayerParams lp; - lp.type = "Scatter"; - lp.name = "testLayer"; - lp.set("reduction", reduction); - lp.set("axis", axis); - - int id = net.addLayerToPrev(lp.name, lp.type, lp); - net.connect(0, 0, id, 0); - net.connect(0, 1, id, 1); - net.connect(0, 2, id, 2); - - // warmup - { - std::vector inpNames(3); - inpNames[0] = "data"; - inpNames[1] = "indices"; - inpNames[2] = "updates"; - net.setInputsNames(inpNames); - net.setInput(data, inpNames[0]); - net.setInput(indices, inpNames[1]); - net.setInput(updates, inpNames[2]); - - net.setPreferableBackend(backendId); - net.setPreferableTarget(targetId); - Mat out = net.forward(); - } - - TEST_CYCLE() - { - Mat res = net.forward(); - } - - SANITY_CHECK_NOTHING(); + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + Mat out = net.forward(); } - int N = 8; - int C = 256; - int H = 128; - int W = 100; -}; - -PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter) -{ - test_layer({N, C, H, W}); -} - -PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add) -{ - test_layer({N, C, H, W}, "add"); -} - -struct Layer_ScatterND : public TestBaseWithParam > -{ - void test_layer(const std::vector& shape, const String reduction = "none") + // perf + TEST_CYCLE() { - int backendId = get<0>(GetParam()); - int targetId = get<1>(GetParam()); - - std::vector 
indices_shape(shape); - indices_shape.push_back(int(shape.size())); - Mat data(shape, CV_32FC1); - Mat indices(indices_shape, CV_32FC1); - Mat updates(shape, CV_32FC1); - - Scalar mean = 0.f; - Scalar std = 1.f; - randn(data, mean, std); - randn(updates, mean, std); - - // initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W] - std::vector current_index_tuple(shape.size()); - int total = data.total(); - std::vector indices_step; - for (int i = 0; i < indices.dims; i++) - { - int step = indices.step.p[i] / sizeof(float); - indices_step.push_back(step); - } - int t, j, idx, offset_at_idx, offset; - for (int i = 0; i < total; i++) - { - t = i; - for (j = shape.size() - 1; j >= 0; j--) - { - idx = t / shape[j]; - offset_at_idx = (int)(t - idx * shape[j]); - current_index_tuple[j] = offset_at_idx; - t = idx; - } - - offset = 0; - for (j = 0; j < shape.size(); j++) - offset += current_index_tuple[j] * indices_step[j]; - - for (j = 0; j < shape.size(); j++) - indices.at(offset + j) = current_index_tuple[j]; - } - - Net net; - LayerParams lp; - lp.type = "ScatterND"; - lp.name = "testLayer"; - lp.set("reduction", reduction); - - int id = net.addLayerToPrev(lp.name, lp.type, lp); - net.connect(0, 0, id, 0); - net.connect(0, 1, id, 1); - net.connect(0, 2, id, 2); - - // warmup - { - std::vector inpNames(3); - inpNames[0] = "data"; - inpNames[1] = "indices"; - inpNames[2] = "updates"; - net.setInputsNames(inpNames); - net.setInput(data, inpNames[0]); - net.setInput(indices, inpNames[1]); - net.setInput(updates, inpNames[2]); - - net.setPreferableBackend(backendId); - net.setPreferableTarget(targetId); - Mat out = net.forward(); - } - - TEST_CYCLE() - { - Mat res = net.forward(); - } - - SANITY_CHECK_NOTHING(); + Mat res = net.forward(); } - int N = 8; - int C = 256; - int H = 128; - int W = 100; -}; - -PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND) -{ - test_layer({N, C, H ,W}); + SANITY_CHECK_NOTHING(); } -PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add) -{ - test_layer({N, C, H , W}, "add"); +INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine( + Values(std::vector{2, 128, 64, 50}), + Values(std::string("none"), std::string("add")), + Values(0), // use Values(0, 1, 2, 3) for more details + dnnBackendsAndTargets(/* withInferenceEngine= */ false, + /* withHalide= */ false, + /* withCpuOCV= */ true, + /* withVkCom= */ false, + /* withCUDA= */ false, + /* withNgraph= */ false, + /* withWebnn= */ false, + /* withCann= */ false) // only test on CPU +)); + +using Layer_ScatterND = TestBaseWithParam, std::string, tuple>>; +PERF_TEST_P_(Layer_ScatterND, scatterND) { + std::vector shape = get<0>(GetParam()); + std::string reduction = get<1>(GetParam()); + int backend_id = get<0>(get<2>(GetParam())); + int target_id = get<1>(get<2>(GetParam())); + + std::vector indices_shape(shape); + indices_shape.push_back(int(shape.size())); + Mat data(shape, CV_32FC1); + Mat indices(indices_shape, CV_32FC1); + Mat updates(shape, CV_32FC1); + + randn(data, 0.f, 1.f); + randn(updates, 0.f, 1.f); + + // Create indices such that indices[n_i, c_j, h_k, w_l, :4] = [i, j, k, l] + std::vector current_index_tuple(shape.size()); + int total = data.total(); + std::vector indices_step; + for (int i = 0; i < indices.dims; i++) + { + int step = indices.step.p[i] / sizeof(float); + indices_step.push_back(step); + } + int t, j, idx, offset_at_idx, offset; + auto *indices_ptr = indices.ptr(); + for (int i = 0; i < total; i++) + { + t = i; + for (j = shape.size() - 1; j >= 0; j--) + { + idx = t / shape[j]; + 
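+            // Worked example of this flat-index decomposition, assuming
+            // row-major layout: for shape = {2, 3, 4} and flat index i = 17,
+            //   j=2: idx = 17/4 = 4, tuple[2] = 17 - 4*4 = 1, t = 4
+            //   j=1: idx = 4/3  = 1, tuple[1] = 4 - 1*3  = 1, t = 1
+            //   j=0: idx = 1/2  = 0, tuple[0] = 1 - 0*2  = 1, t = 0
+            // so flat index 17 corresponds to the coordinate tuple (1, 1, 1),
+            // and indeed ((1*3 + 1)*4) + 1 == 17.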
offset_at_idx = (int)(t - idx * shape[j]); + current_index_tuple[j] = offset_at_idx; + t = idx; + } + + offset = 0; + for (j = 0; j < shape.size(); j++) + offset += current_index_tuple[j] * indices_step[j]; + + for (j = 0; j < shape.size(); j++) + indices_ptr[offset + j] = current_index_tuple[j]; + } + + Net net; + LayerParams lp; + lp.type = "ScatterND"; + lp.name = "testLayer"; + lp.set("reduction", reduction); + + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup + { + std::vector input_names{"data", "indices", "updates"}; + net.setInputsNames(input_names); + net.setInput(data, input_names[0]); + net.setInput(indices, input_names[1]); + net.setInput(updates, input_names[2]); + + net.setPreferableBackend(backend_id); + net.setPreferableTarget(target_id); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); } +INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine( + Values(std::vector{2, 128, 64, 50}), + Values(std::string("none"), std::string("add")), + dnnBackendsAndTargets(/* withInferenceEngine= */ false, + /* withHalide= */ false, + /* withCpuOCV= */ true, + /* withVkCom= */ false, + /* withCUDA= */ false, + /* withNgraph= */ false, + /* withWebnn= */ false, + /* withCann= */ false) // only test on CPU +)); + struct Layer_LayerNorm : public TestBaseWithParam > { void test_layer(const std::vector& x_shape) @@ -795,19 +783,77 @@ PERF_TEST_P_(Layer_Attention, VisionTransformer) { test_layer({1, 197, 768}, {768, 768, 768}, 12); } +struct Layer_GroupNorm : public TestBaseWithParam > +{ + void test_layer(const std::vector& x_shape, int num_groups) + { + int backendId = get<0>(GetParam()); + int targetId = get<1>(GetParam()); + + Mat x(x_shape, CV_32FC1); + Mat scale(x_shape[1], 1, CV_32FC1); + Mat b(x_shape[1], 1, CV_32FC1); + + randu(x, 0.f, 1.f); + randu(scale, 0.f, 1.f); + randu(b, 0.f, 1.f); + + Net net; + LayerParams lp; + lp.type = "GroupNormalization"; + lp.name = "testLayer"; + lp.set("num_groups", num_groups); + + int id = net.addLayerToPrev(lp.name, lp.type, lp); + net.connect(0, 0, id, 0); + net.connect(0, 1, id, 1); + net.connect(0, 2, id, 2); + + // warmup + { + std::vector inpNames{"x", "scale", "b"}; + net.setInputsNames(inpNames); + net.setInput(x, inpNames[0]); + net.setInput(scale, inpNames[1]); + net.setInput(b, inpNames[2]); + + net.setPreferableBackend(backendId); + net.setPreferableTarget(targetId); + Mat out = net.forward(); + } + + TEST_CYCLE() + { + Mat res = net.forward(); + } + + SANITY_CHECK_NOTHING(); + } + + int N = 2; + int C = 64; + int H = 180; + int W = 240; + int num_groups = 16; +}; + +PERF_TEST_P_(Layer_GroupNorm, GroupNorm) +{ + test_layer({N, C, H, W}, num_groups); +} + + INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false)); INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); #ifdef HAVE_CUDA INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA))); #endif -INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); -INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, 
Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_InstanceNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); INSTANTIATE_TEST_CASE_P(/**/, Layer_Attention, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); - +INSTANTIATE_TEST_CASE_P(/**/, Layer_GroupNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU))); typedef TestBaseWithParam > > Layer_FullyConnected; PERF_TEST_P_(Layer_FullyConnected, fc) diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index 16f6cccf6b..e2a7cc9a67 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan output, TensorView x, Ten } else { - CV_Assert(is_shape_compatible(output, x)); - CV_Assert(is_shape_compatible(output, y)); + auto inShape1 = x.shape_as_vector(); + auto inShape2 = y.shape_as_vector(); + auto outShape = output.shape_as_vector(); + + std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size(); + if (x_ndims >= y_ndims) { + for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) { + inShape2.insert(inShape2.begin(), 1); + } + } else { + for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) { + inShape1.insert(inShape1.begin(), 1); + } + } + + CV_Assert(is_shape_compatible1(outShape, inShape1)); + CV_Assert(is_shape_compatible1(outShape, inShape2)); /* matching singleton axes in both input tensors can be eliminated * @@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan output, TensorView x, Ten * x: [1, 256, 32, 32] -> [256, 32, 32] * y: [1, 256, 1, 1] -> [256, 1, 1] */ - for (int r = 0; r < output.rank(); r++) - { - while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) { - CV_Assert(output.get_axis_size(r) == 1); - - x.squeeze(r); - y.squeeze(r); - output.squeeze(r); + int eliminate_times = 0; + for (std::size_t i = 0; i < outShape.size(); i++) { + if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) { + eliminate_times++; + } else { + break; + } + } + if (eliminate_times > 0) { + for (int i = 0; i < eliminate_times; i++) { + inShape1.erase(inShape1.begin()); + inShape2.erase(inShape2.begin()); + outShape.erase(outShape.begin()); } } - - auto inShape1 = x.shape_as_vector(); - auto inShape2 = y.shape_as_vector(); - auto outShape = output.shape_as_vector(); /* contiguous axes that do not broadcast can be merged into one axis * @@ -324,7 +340,19 @@ void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, eltwise_op>(stream, output, x, y); } +template +void eltwise_mod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y) { + eltwise_op>(stream, output, x, y); +} + +template +void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y) { + eltwise_op>(stream, output, x, y); +} + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) + template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); + template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y); template void eltwise_sub_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> 
y);
 template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
@@ -333,6 +361,8 @@ void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
 template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
 #endif
+    template void eltwise_mod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
+    template void eltwise_fmod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_sub_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
     template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
diff --git a/modules/dnn/src/cuda/functors.hpp b/modules/dnn/src/cuda/functors.hpp
index 2df32030f0..cada43387e 100644
--- a/modules/dnn/src/cuda/functors.hpp
+++ b/modules/dnn/src/cuda/functors.hpp
@@ -799,6 +799,40 @@ struct ReciprocalFunctor {
     }
 };

+template <class T>
+struct ModFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() {}
+    };
+
+    CUDA4DNN_DEVICE ModFunctor() { }
+    CUDA4DNN_DEVICE ModFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) {
+        int res = (int)x % (int)y;
+        T zero = T(0);
+        if ((res > (int)zero && y < zero) || (res < (int)zero && y > zero)) {
+            res += (int)y;
+        }
+        return res;
+    }
+};
+
+template <class T>
+struct FModFunctor {
+    struct Params {
+        CUDA4DNN_HOST_DEVICE Params() {}
+    };
+
+    CUDA4DNN_DEVICE FModFunctor() { }
+    CUDA4DNN_DEVICE FModFunctor(const Params& params) { }
+
+    CUDA4DNN_DEVICE T operator()(T x, T y) {
+        using csl::device::fmod;
+        return fmod(x, y);
+    }
+};
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */

 #endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */
diff --git a/modules/dnn/src/cuda/math.hpp b/modules/dnn/src/cuda/math.hpp
index 0a312a250d..8e4f091f4f 100644
--- a/modules/dnn/src/cuda/math.hpp
+++ b/modules/dnn/src/cuda/math.hpp
@@ -36,6 +36,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de
     template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
     template <> inline __device__ double min(double x, double y) { return fmin(x, y); }

+    template <class T> __device__ T fmod(T x, T y) { return x % y; }
+    template <> inline __device__ float fmod(float x, float y) { return fmodf(x, y); }
+    template <> inline __device__ double fmod(double x, double y) { return ::fmod(x, y); } /* qualify the call so the specialization does not recurse into itself */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+    template <> inline __device__ half fmod(half x, half y) { return fmodf((float)x, (float)y); }
+#endif
+
     template <class T> __device__ T log1p(T val);
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
     template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
diff --git a/modules/dnn/src/cuda/mvn.cu b/modules/dnn/src/cuda/mvn.cu
index 0accc499a2..d6db7c4fb4 100644
--- a/modules/dnn/src/cuda/mvn.cu
+++ b/modules/dnn/src/cuda/mvn.cu
@@ -78,6 +78,18 @@ namespace raw {
         }
     }

+    template <class T>
+    __global__ void normalize_mean_variance_groupwise(Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, size_type inner_size, size_type C, size_type num_groups, size_type
group_size) { + for (auto idx : grid_stride_range(output.size())) { + const index_type outer_idx = idx / inner_size; + const index_type c = outer_idx % C; + const index_type group_idx = outer_idx / group_size; + auto s = static_cast(scale[c]) * inv_stddev[group_idx]; + auto b = static_cast(bias[c]); + output[idx] = (static_cast(input[idx]) - means[group_idx]) * s + b; + } + } + template __global__ void normalize_mean_variance_layernorm(Span output, View input, View scale, View means, View inv_stddev, size_type inner_size) { for (auto idx : grid_stride_range(output.size())) { @@ -191,6 +203,24 @@ template void normalize_mean_variance_channelwise(const Stream&, Span<__half> /* #endif template void normalize_mean_variance_channelwise(const Stream&, Span /*output*/, View /*input*/, View /*scale*/, View /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t); +template +void normalize_mean_variance_groupwise(const Stream& stream, Span output, View input, View scale, View bias, View means, View inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size) +{ + CV_Assert(input.size() == output.size()); + CV_Assert(input.size() / inner_size == means.size() * group_size); + CV_Assert(means.size() == inv_stddev.size()); + + auto kernel = raw::normalize_mean_variance_groupwise; + auto policy = make_policy(kernel, output.size(), 0, stream); + launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size, C, num_groups, group_size); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +template void normalize_mean_variance_groupwise(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t); +#endif +template void normalize_mean_variance_groupwise(const Stream&, Span /*output*/, View /*input*/, View /*scale*/, View /*bias*/, View /*means*/, View /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t); + + template void normalize_mean_variance_layernorm(const Stream& stream, Span output, View input, View scale, View means, View inv_stddev, std::size_t inner_size) { diff --git a/modules/dnn/src/cuda4dnn/csl/tensor.hpp b/modules/dnn/src/cuda4dnn/csl/tensor.hpp index f113a55689..1745c60906 100644 --- a/modules/dnn/src/cuda4dnn/csl/tensor.hpp +++ b/modules/dnn/src/cuda4dnn/csl/tensor.hpp @@ -1262,6 +1262,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { return true; } + template + bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept { + const auto x_ndims = x_shape.size(), y_ndims = y_shape.size(); + + if (x_ndims != y_ndims) { + return false; + } + + for (int i = 0; i < x_ndims; i++) { + if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) { + return false; + } + } + + return true; + } + /** returns the rank to which the given tensor can be squeezed to */ template std::size_t get_effective_rank(const TensorType& x) noexcept { diff --git a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp index 3dc3355b3b..e80db943ae 100644 --- a/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp @@ -33,6 +33,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + template + 
void eltwise_mod_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + + template + void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan output, csl::TensorView x, csl::TensorView y); + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */ diff --git a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp index 6cddeb22bb..a09dafb76d 100644 --- a/modules/dnn/src/cuda4dnn/kernels/mvn.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/mvn.hpp @@ -35,6 +35,10 @@ void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span o template void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span output, csl::View input, csl::View scale, csl::View bias, csl::View means, csl::View inv_stddev, std::size_t inner_size); +template +void normalize_mean_variance_groupwise(const csl::Stream &stream, csl::Span output, csl::View input, csl::View scale, csl::View bias, csl::View means, csl::View inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size); + + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */ diff --git a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp index 05bca83820..5822f48061 100644 --- a/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp @@ -28,6 +28,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { DIV, MIN, SUB, + MOD, + FMOD, }; class EltwiseOpBase : public CUDABackendNode { @@ -90,6 +92,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { kernels::eltwise_sum_coeff_2(stream, output, coeffs[0], input_x, coeffs[1], input_y); break; case EltwiseOpType::SUB: kernels::eltwise_sub_2(stream, output, input_x, input_y); break; + case EltwiseOpType::MOD: kernels::eltwise_mod_2(stream, output, input_x, input_y); break; + case EltwiseOpType::FMOD: kernels::eltwise_fmod_2(stream, output, input_x, input_y); break; } } else @@ -122,6 +126,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { } break; case EltwiseOpType::SUB: kernels::eltwise_sub_2(stream, output, output, input); break; + case EltwiseOpType::MOD: kernels::eltwise_mod_2(stream, output, output, input); break; + case EltwiseOpType::FMOD: kernels::eltwise_fmod_2(stream, output, output, input); break; } } } diff --git a/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp new file mode 100644 index 0000000000..bb3e162a33 --- /dev/null +++ b/modules/dnn/src/cuda4dnn/primitives/group_norm.hpp @@ -0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
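Before the primitive itself, a scalar reference of the computation it implements may help: each of the N*num_groups channel groups is normalized by its own mean and variance, then per-channel scale and bias are applied. This sketch assumes CV_32F NCHW input; the helper name groupNormRef is ours, nothing below is used by the CUDA code:

```cpp
#include <cmath>
#include <vector>

void groupNormRef(const std::vector<float>& x, std::vector<float>& y,
                  const std::vector<float>& scale, const std::vector<float>& bias,
                  int N, int C, int HW, int G, float eps) {
    const int cpg = C / G;     // channels per group
    const int gsz = cpg * HW;  // elements per group
    for (int n = 0; n < N; ++n)
        for (int g = 0; g < G; ++g) {
            const int base = (n * C + g * cpg) * HW;  // groups are contiguous in NCHW
            double mean = 0, sqmean = 0;
            for (int i = 0; i < gsz; ++i) { double v = x[base + i]; mean += v; sqmean += v * v; }
            mean /= gsz; sqmean /= gsz;
            const float inv_std = 1.f / std::sqrt(float(sqmean - mean * mean) + eps);
            for (int i = 0; i < gsz; ++i) {
                const int c = g * cpg + i / HW;  // channel of this element
                y[base + i] = scale[c] * (float(x[base + i]) - float(mean)) * inv_std + bias[c];
            }
        }
}
```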
+ +#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP +#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP + +#include "../../op_cuda.hpp" + +#include "../csl/stream.hpp" +#include "../csl/span.hpp" +#include "../csl/tensor.hpp" +#include "../csl/workspace.hpp" + +#include "../kernels/fill_copy.hpp" +#include "../kernels/mvn.hpp" + +#include + +#include +#include +#include + +namespace cv { namespace dnn { namespace cuda4dnn { + + template + class GroupNormOp final : public CUDABackendNode { + public: + using wrapper_type = GetCUDABackendWrapperType; + + GroupNormOp(csl::Stream stream_, float epsilon_, size_t loops, size_t num_groups) + : stream(std::move(stream_)), epsilon(epsilon_), num_groups(num_groups) { + csl::WorkspaceBuilder builder; + builder.require(loops * num_groups); // mean and stdev for each group + builder.require(loops * num_groups); + scratch_mem_in_bytes = builder.required_workspace_size(); + } + + void forward(const std::vector>& inputs, + const std::vector>& outputs, + csl::Workspace& workspace) override { + auto input_wrapper = inputs[0].dynamicCast(); + auto scale_wrapper = inputs[1].dynamicCast(); + auto bias_wrapper = inputs[2].dynamicCast(); + + auto input = input_wrapper->getView(); + auto scale = scale_wrapper->getView(); + auto bias = bias_wrapper->getView(); + + auto output_wrapper = outputs[0].dynamicCast(); + auto output = output_wrapper->getSpan(); + + auto C = input.get_axis_size(1); + auto loops = input.size_range(0, 2); + auto norm_size = input.size_range(2, input.rank()); + auto num_groups = this->num_groups; + auto group_size = C / num_groups; + if (norm_size == 1) { + kernels::fill(stream, output, 0.f); + return; + } else { + auto ws_allocator = csl::WorkspaceAllocator(workspace); + + auto mean = ws_allocator.get_span(loops / group_size); + kernels::fill(stream, mean, 0.f); + + auto stdev = ws_allocator.get_span(loops / group_size); + kernels::fill(stream, stdev, 0.f); + + kernels::reduce_mean_sqr_sum(stream, mean, stdev, input, norm_size * group_size); + kernels::compute_normalization_scale(stream, stdev, mean, stdev, norm_size * group_size, epsilon); + kernels::normalize_mean_variance_groupwise(stream, output, input, scale, bias, mean, stdev, norm_size, C, num_groups, group_size); + } + } + + std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } + + private: + csl::Stream stream; + float epsilon; + std::size_t num_groups; + std::size_t scratch_mem_in_bytes; + }; + +}}} // cv::dnn::cuda4dnn + +#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 9b433dac50..2170aafc4b 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -163,6 +163,7 @@ void initializeLayerFactory() CV_DNN_REGISTER_LAYER_CLASS(Expand, ExpandLayer); CV_DNN_REGISTER_LAYER_CLASS(InstanceNormalization, InstanceNormLayer); CV_DNN_REGISTER_LAYER_CLASS(Attention, AttentionLayer); + CV_DNN_REGISTER_LAYER_CLASS(GroupNormalization, GroupNormLayer); CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer); CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer); diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp index ba9b31fe35..6121e971a2 100644 --- a/modules/dnn/src/int8layers/convolution_layer.cpp +++ b/modules/dnn/src/int8layers/convolution_layer.cpp @@ -969,6 +969,13 @@ public: stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, biasptr, multptr, inptr_, height, width, outptr_, out_d, 
outH, outW, inpZp, outZp); else + #endif + #if CV_RVP052 + if(isConv2D) + opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w, + stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l, + biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp); + else #endif { const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], @@ -1348,6 +1355,12 @@ public: opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); else + #endif + #if CV_RVP052 + if(isConv2D) + opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0, + outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn); + else #endif for( int i = 0; i < outCn; i += 2 ) { diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp index ba5b0d79c1..3a560ddda6 100644 --- a/modules/dnn/src/int8layers/fully_connected_layer.cpp +++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp @@ -302,6 +302,11 @@ public: if( useLASX ) opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); else + #endif + #if CV_RVP052 + if( 1 ) + opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp ); + else #endif { int i = 0; diff --git a/modules/dnn/src/int8layers/layers_common.hpp b/modules/dnn/src/int8layers/layers_common.hpp index 5fdafbeab8..4612feed48 100644 --- a/modules/dnn/src/int8layers/layers_common.hpp +++ b/modules/dnn/src/int8layers/layers_common.hpp @@ -13,6 +13,8 @@ #include "int8layers/layers_common.simd_declarations.hpp" #undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY +#include "./layers_rvp052.hpp" + #ifdef HAVE_OPENCL #include "../ocl4dnn/include/ocl4dnn.hpp" #endif diff --git a/modules/dnn/src/int8layers/layers_rvp052.cpp b/modules/dnn/src/int8layers/layers_rvp052.cpp new file mode 100644 index 0000000000..628882a43f --- /dev/null +++ b/modules/dnn/src/int8layers/layers_rvp052.cpp @@ -0,0 +1,221 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../precomp.hpp" +#include "./layers_rvp052.hpp" + +#if CV_RVP052 + +namespace cv { +namespace dnn { +namespace opt_RVP052 { + +void fastConv(const int8_t *weights, size_t wstep, const int *bias, + const int8_t *rowbuf, int *output, const int *outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float *multiplier, bool initOutput, bool finalOutput) +{ + int outCn = outShape[1]; + size_t outPlaneSize = outShape[2] * outShape[3]; + for (int i = 0; i < outCn; i += 2) + { + const int8_t *wptr0 = weights + i * wstep; + const int8_t *wptr1 = wptr0 + wstep; + int *outptr0 = output + i * outPlaneSize; + int *outptr1 = outptr0 + outPlaneSize; + int bias0 = bias[i], bias1 = bias[i + 1]; + float mult0 = multiplier[i], mult1 = multiplier[i + 1]; + + if (i + 1 >= outCn) + { + wptr1 = wptr0; + outptr1 = outptr0; + bias1 = bias0; + mult1 = mult0; + } + int j = 0; + for (; j < blockSize; j++) + { + const int8_t *rptr = rowbuf + j * vecsize_aligned; + int s00 = initOutput ? bias0 : outptr0[j]; + int s10 = initOutput ? 
bias1 : outptr1[j]; + + int32x2_t vsx0 = {s00, s10}; + + for (int k = 0; k < vecsize; k += 4) + { + int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)}; + vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr); + } + + if (finalOutput) + { + vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0); + vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1); + vsx0 = __nds__v_sclip32(vsx0, 7); + } + + outptr0[j] = vsx0[0]; + outptr1[j] = vsx0[1]; + } + } +} + +void fastDepthwiseConv(const int8_t *wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int *biasptr, const float *multptr, + const int8_t *inptr_, + int height, int width, + int *outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp) +{ + const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2], + w10 = wptr[3], w11 = wptr[4], w12 = wptr[5], + w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8]; + int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w); + int bias = biasptr[out_d], biasCopy; + float mult = multptr[out_d]; + + for (int out_i = 0; out_i < outH; out_i++) + { + int in_i = out_i * stride_h - pad_t, out_j = 0; + const int8_t *imgptr0 = inptr_ + in_i * width; + const int8_t *imgptr1 = imgptr0 + dilation_h * width; + const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width; + int8_t w00 = w00_, w01 = w01_, w02 = w02_; + int8_t w20 = w20_, w21 = w21_, w22 = w22_; + int out; + biasCopy = bias; + + if (in_i < 0) + { + biasCopy += inpZp * (w00 + w01 + w02); + w00 = w01 = w02 = 0; + imgptr0 = imgptr1; + } + else if (in_i + dilation_h * (kernel_h - 1) >= height) + { + biasCopy += inpZp * (w20 + w21 + w22); + w20 = w21 = w22 = 0; + imgptr2 = imgptr1; + } + int *outptr = outptr_ + out_i * outW; + if (pad_l > 0) + { + out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 + + (int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 + + (int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 + + biasCopy + inpZp * (w00 + w10 + w20); + outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7); + out_j = 1; + } + + int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0}; + int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0}; + int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0}; + int8x8_t vimgx0, vimgx1, vimgx2; + int32x2_t vout = {0, 0}; + for (; out_j < outW1; out_j+=2) + { + int in_j = out_j * stride_w - pad_l; + vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0, + imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0}; + vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0, + imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0}; + vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0, + imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0}; + + vout = (int32x2_t){biasCopy, biasCopy}; + vout = __nds__v_smaqa(vout, vwx0, vimgx0); + vout = __nds__v_smaqa(vout, vwx1, vimgx1); + vout = __nds__v_smaqa(vout, vwx2, vimgx2); + + outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7); + outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7); + } + + while 
(out_j > outW1) out_j--; + + for (; out_j < outW; out_j++) + { + int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2; + int s0 = 1, s1 = 1, s2 = 1; + if (in_j0 >= width) + { + in_j0 = 0; + s0 = 0; + biasCopy += inpZp * (w00 + w10 + w20); + } + if (in_j1 >= width) + { + in_j1 = 0; + s1 = 0; + biasCopy += inpZp * (w01 + w11 + w21); + } + if (in_j2 >= width) + { + in_j2 = 0; + s2 = 0; + biasCopy += inpZp * (w02 + w12 + w22); + } + out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 + + (int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 + + (int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy; + outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7); + } + } +} + +// dst = vec * weights^t + bias +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ) +{ + int i = 0; + + for( ; i <= nvecs - 2; i += 2 ) + { + const int8_t* wptr0 = weights + i * wstep; + const int8_t* wptr1 = weights + (i + 1) * wstep; + + int32x2_t vs0 = *(int32x2_t*)(bias + i); + + for( int k = 0; k < vecsize; k += 4 ) + { + int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)}; + vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec); + } + + int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])}; + + vdst = __nds__v_sclip32(vdst + outZp, 7); + + *(int32x2_t*)(dst + i) = vdst; + } + + for( ; i < nvecs; i++ ) + { + const int8_t* wptr = weights + i * wstep; + int s0 = bias[i]; + + for( int k = 0; k < vecsize; k += 4 ) + { + int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0}; + int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0}; + s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec); + } + + dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7); + } +} + +}}} // namespace + +#endif diff --git a/modules/dnn/src/int8layers/layers_rvp052.hpp b/modules/dnn/src/int8layers/layers_rvp052.hpp new file mode 100644 index 0000000000..c956caf20c --- /dev/null +++ b/modules/dnn/src/int8layers/layers_rvp052.hpp @@ -0,0 +1,36 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
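The kernels above lean on the Andes `__nds__v_smaqa` intrinsic. For readers without the toolchain, its effect on one accumulator pair can be sketched in scalar code, under the assumption that smaqa multiplies eight signed 8-bit pairs and adds each group of four products into the matching 32-bit lane (hypothetical helper, not part of the patch):

```cpp
#include <cstdint>

// Scalar model of __nds__v_smaqa on a two-lane int32 accumulator.
static inline void smaqa_ref(int32_t acc[2], const int8_t w[8], const int8_t v[8]) {
    for (int lane = 0; lane < 2; ++lane)
        for (int k = 0; k < 4; ++k)
            acc[lane] += int32_t(w[lane * 4 + k]) * int32_t(v[lane * 4 + k]);
}
```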
+ +#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES) +# include +# define CV_RVP052 1 + +namespace cv { +namespace dnn { +namespace opt_RVP052 { + +void fastConv( const int8_t* weights, size_t wstep, const int* bias, + const int8_t* rowbuf, int* output, const int* outShape, + int blockSize, int vecsize, int vecsize_aligned, int outZp, + const float* multiplier, bool initOutput, bool finalOutput ); +void fastDepthwiseConv( const int8_t* wptr, + int kernel_h, int kernel_w, + int stride_h, int stride_w, + int dilation_h, int dilation_w, + int pad_t, int pad_l, + const int* biasptr, const float* multptr, + const int8_t* inptr_, + int height, int width, + int* outptr_, + int out_d, int outH, int outW, + int inpZp, int outZp ); +void fastGEMM1T( const int8_t* vec, const int8_t* weights, + size_t wstep, const int* bias, const float* multiplier, + int* dst, int nvecs, int vecsize, int outZp ); + +}}} + +#else +# define CV_RVP052 0 +#endif diff --git a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp index 6cf066576b..d19cec64de 100644 --- a/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp +++ b/modules/dnn/src/layers/cpu_kernels/conv_winograd_f63.cpp @@ -338,7 +338,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu } #if CV_TRY_AVX2 if (conv->useAVX2) - opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, + opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE, bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct); else #endif diff --git a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp index a8972aba4e..f8fe2bb40e 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_gemm.cpp @@ -385,7 +385,7 @@ void fastGemmBatch(bool trans_a, bool trans_b, const auto shape_b = shape(B); const auto shape_c = shape(C); CV_CheckGE(shape_a.size(), static_cast(2), "DNN/fastGemmBatch: A must be n-dimensional (n >= 2)"); - CV_CheckEQ(shape_b.size(), static_cast(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)"); + CV_CheckGE(shape_b.size(), static_cast(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)"); const float *a = A.ptr(); const float *b = B.ptr(); diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp index ab9d8ee0af..35f354ed29 100644 --- a/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp +++ b/modules/dnn/src/layers/cpu_kernels/fast_norm.cpp @@ -158,4 +158,51 @@ void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &o parallel_for_(Range(0, loops), fn, nstripes); } +void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups) { + const auto input_shape = shape(input); + size_t N = input_shape[0], C = input_shape[1]; + CV_CheckEQ(scale.total(), bias.total(), "fastNormGroup: scale and bias should have the same shape"); + CV_CheckEQ(scale.total(), C, "fastNormGroup: scale should be a 1d tensor and match the channel of input"); + CV_CheckGE(input.dims, 3, "fastNormGroup: input dimension >= 3"); + + size_t channels_per_group = C / num_groups; + size_t loops = N * num_groups; + size_t norm_size = static_cast(total(input_shape, 2) * channels_per_group); + 
size_t step = norm_size / channels_per_group;
+    float inv_norm_size = 1.0 / norm_size;
+
+    auto fn = [&](const Range &r) {
+        const auto *input_data = input.ptr<float>();
+        const auto *scale_data = scale.ptr<float>();
+        const auto *bias_data = bias.ptr<float>();
+        auto *output_data = output.ptr<float>();
+
+        for (int i = r.start; i < r.end; i++) {
+            const auto *x = input_data + norm_size * i;
+            auto *y = output_data + norm_size * i;
+
+            float mean = 0.f, mean_square = 0.f;
+            for (int j = 0; j < norm_size; j++) {
+                float v = x[j];
+                mean += v;
+                mean_square += v * v;
+            }
+
+            mean *= inv_norm_size;
+            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
+            float inv_stdev = 1.f / mean_square;
+
+            size_t group_idx = (i % num_groups) * channels_per_group;
+            for (size_t j = 0; j < norm_size; j++) {
+                size_t c = group_idx + (j / step);
+                float s = scale_data[c] * inv_stdev, b = bias_data[c];
+                y[j] = s * (x[j] - mean) + b;
+            }
+        }
+    };
+
+    double nstripes = loops * norm_size * (1 / 1024.0);
+    parallel_for_(Range(0, loops), fn, nstripes);
+}
+
 }} // cv::dnn
diff --git a/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp
index 61316542d3..72cbdad0a7 100644
--- a/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/fast_norm.hpp
@@ -21,6 +21,9 @@ void fastNorm(const Mat &input, const Mat &scale, const Mat &bias, Mat &output,
 // Channel-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
 void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon);

+// Group-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
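+// A hypothetical call sketch (CV_32F, NCHW; the shapes are illustrative only):
+//     Mat x({2, 64, 180, 240}, CV_32FC1), y({2, 64, 180, 240}, CV_32FC1);
+//     Mat scale(64, 1, CV_32FC1), bias(64, 1, CV_32FC1);
+//     fastNormGroup(x, scale, bias, y, 1e-5f, /*num_groups=*/16);  // 16 groups of 4 channels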
+void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups); + }} // cv::dnn #endif // OPENCV_DNN_FAST_NORM_HPP diff --git a/modules/dnn/src/layers/einsum_layer.cpp b/modules/dnn/src/layers/einsum_layer.cpp index 172ba47478..f4b2482c85 100644 --- a/modules/dnn/src/layers/einsum_layer.cpp +++ b/modules/dnn/src/layers/einsum_layer.cpp @@ -1299,7 +1299,6 @@ Mat LayerEinsumImpl::batchwiseMatMul( const Mat& input2, const MatShape& input2ShapeOverride) { - // Sanity checks before the actual MatMul CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul"); CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul"); @@ -1312,59 +1311,21 @@ Mat LayerEinsumImpl::batchwiseMatMul( int K = input1ShapeOverride[2]; int N = input2ShapeOverride[2]; - std::vector output; + Mat reshapedInput1 = input1; + Mat reshapedInput2 = input2; + + Mat output; if (batches > 1) { - Mat reshapedInput1 = input1; - Mat reshapedInput2 = input2; + // create tmpout with type like input1 + output = Mat({batches, M, N}, input1.type()); - // input1 should of size MxK - // check if input1 needs reshape, if need reshape - if (input1.size[0] != M || input1.size[1] != K) - { - int shape[] = {batches, M, K}; - reshapedInput1 = input1.reshape(1, 3, shape); - } - - // input2 should be of size KxN - // check if input2 needs reshape, if needs reshape - if (input2.size[0] != K || input2.size[1] != N) - { - int shape[] = {batches, K, N}; - reshapedInput2 = input2.reshape(1, 3, shape); - } - - for (size_t i=0; i < batches; i++) - { - std::vector ranges1 = {cv::Range(i, i+1)}; - for (int j = 1; j < reshapedInput1.dims; j++) - ranges1.emplace_back(cv::Range::all()); - - Mat part1 = reshapedInput1(ranges1); - int shape[] = {M, K}; - part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape); - - std::vector ranges2 = {cv::Range(i, i+1)}; - for (int j = 1; j < reshapedInput2.dims; j++) - ranges2.emplace_back(cv::Range::all()); - - Mat part2 = reshapedInput2(ranges2); - int shape2[] = {K, N}; - part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2); - - Mat tmp_output(M, N, part1.type()); - fastGemm(false, false, 1.0, part1, part2, 0.0, tmp_output, opt); - int newShape[] = {1, M, N}; - tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape); - - output.emplace_back(tmp_output); - } + reshapedInput2 = reshapedInput2.reshape(1, input2ShapeOverride); + reshapedInput1 = reshapedInput1.reshape(1, input1ShapeOverride); + fastGemmBatch(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt); } else { - Mat reshapedInput1 = input1; - Mat reshapedInput2 = input2; - // input1 should of size MxK // check if input1 needs reshape, if need reshape if (input1.dims > 2 || input1.size[0] != M || (input1.dims > 1 && input1.size[1] != K) || input1.dims == 1) @@ -1381,23 +1342,12 @@ Mat LayerEinsumImpl::batchwiseMatMul( reshapedInput2 = input2.reshape(1, 2, shape2); } - Mat tmp_output(M, N, reshapedInput1.type()); - fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, tmp_output, opt); - - int newShape[] = {1, M, N}; - tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape); - output.emplace_back(tmp_output); + output = Mat(M, N, reshapedInput1.type()); + fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt); + output = output.reshape(1, {1, M, N}); } - - int outputDim[] = {static_cast(output.size()), M, N}; - Mat 
output_buffer = Mat::zeros(3, outputDim, CV_32F);
-
-    for (size_t i = 0; i < output.size(); i++) {
-        Mat output_slice = output_buffer.row(i);
-        output[i].copyTo(output_slice);
-    }
-    return output_buffer;
+    return output;
 };

 Ptr<EinsumLayer> EinsumLayer::create(const LayerParams& params)
 {
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 1d655a50f6..c938638f35 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -453,13 +453,6 @@ public:
                 ret = false;
                 break;
             }
-
-            if (!use_half && bias && (outerSize > 1))
-            {
-                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
-                UMat& biases = umat_blobs[1];
-                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
-            }
         }

         if (ret) return true;
diff --git a/modules/dnn/src/layers/group_norm_layer.cpp b/modules/dnn/src/layers/group_norm_layer.cpp
new file mode 100644
index 0000000000..006e8fe7f8
--- /dev/null
+++ b/modules/dnn/src/layers/group_norm_layer.cpp
@@ -0,0 +1,190 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include "./cpu_kernels/fast_norm.hpp"
+
+// CUDA backend
+#include "../op_cuda.hpp"
+#ifdef HAVE_CUDA
+#include "../cuda4dnn/primitives/group_norm.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+// OpenCL backend
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/math_functions.hpp"
+#include "opencl_kernels_dnn.hpp"
+#endif
+
+namespace cv {
+namespace dnn {
+
+// https://github.com/onnx/onnx/blob/main/docs/Operators.md#GroupNormalization
+class GroupNormLayerImpl CV_FINAL : public GroupNormLayer {
+public:
+    GroupNormLayerImpl(const LayerParams &params) {
+        setParamsFrom(params);
+
+        epsilon = params.get<float>("epsilon", 1e-5);
+        num_groups = params.get<int>("num_groups");
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE {
+        return backendId == DNN_BACKEND_OPENCV ||
+               backendId == DNN_BACKEND_CUDA;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE {
+        const auto &input = inputs[0];
+        const auto &scale = inputs[1];
+        const auto &bias = inputs[2];
+        CV_CheckGE(input.size(), static_cast<size_t>(3), "DNN/GroupNorm: input dimension >= 3 is required");
+
+        int C = input[1];
+        int scale_dim = std::accumulate(scale.begin(), scale.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(scale_dim, C, "DNN/GroupNorm: scale must be a 1d tensor and match the channel of input");
+        int bias_dim = std::accumulate(bias.begin(), bias.end(), 1, std::multiplies<int>());
+        CV_CheckEQ(bias_dim, C, "DNN/GroupNorm: bias must be a 1d tensor and match the channel of input");
+
+        outputs.assign(1, inputs[0]);
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16S) {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        const auto& input = inputs[0];
+        const auto& scale = inputs[1];
+        const auto& bias = inputs[2];
+
+        fastNormGroup(input, scale, bias, outputs[0], epsilon, num_groups);
+    }
+
+#ifdef HAVE_OPENCL
+    bool
forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { + std::vector<UMat> inputs; + std::vector<UMat> outputs; + + inputs_.getUMatVector(inputs); + outputs_.getUMatVector(outputs); + + const auto &input = inputs[0], &scale = inputs[1], &bias = inputs[2]; + auto &output = outputs[0]; + + const auto input_shape = shape(input); + size_t N = input_shape[0], C = input_shape[1]; + size_t num_groups = this->num_groups; + size_t channels_per_group = C / num_groups; + size_t loops = N * num_groups, norm_size = static_cast<size_t>(total(input_shape, 2)) * channels_per_group; + float inv_norm_size = 1.f / norm_size; + + // no fp16 support + if (input.depth() == CV_16S) { + return false; + } + + String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4"); + + // Calculate mean + UMat one = UMat::ones(norm_size, 1, CV_32F); + UMat mean = UMat(loops, 1, CV_32F); + UMat mean_square = UMat(loops, 1, CV_32F); + UMat tmp = UMat(loops, norm_size, CV_32F); + bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size, + input, 0, one, 0, 0.f, mean, 0); + if (!ret) { + return false; + } + // Calculate mean_square + int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1); + size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)}; + String build_opt = format(" -DNUM=%d", num_vector) + base_opts; + String mean_square_kernel_name = format("calc_mean%d", num_vector); + ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN"); + if (mean_square_kernel.empty()) { + return false; + } + mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input)); + mean_square_kernel.set(1, (int)loops); + mean_square_kernel.set(2, (int)norm_size); + mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean)); + mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp)); + ret = mean_square_kernel.run(2, global, NULL, false); + if (!ret) { + return false; + } + ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size, + tmp, 0, one, 0, 0.f, mean_square, 0); + if (!ret) { + return false; + } + // Calculate group norm: output = scale * (x - mean) / sqrt(var + eps) + bias + String mvn_group_kernel_name = format("mvn_group%d", num_vector); + build_opt += " -DNORM_VARIANCE -DKERNEL_MVN_GROUP"; + ocl::Kernel mvn_group_kernel(mvn_group_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt); + if (mvn_group_kernel.empty()) { + return false; + } + mvn_group_kernel.set(0, ocl::KernelArg::PtrReadOnly(input)); + mvn_group_kernel.set(1, (int)loops); + mvn_group_kernel.set(2, (int)norm_size); + mvn_group_kernel.set(3, (float)epsilon); + mvn_group_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean)); + mvn_group_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square)); + mvn_group_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale)); + mvn_group_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias)); + mvn_group_kernel.set(8, (int)C); + mvn_group_kernel.set(9, (int)num_groups); + mvn_group_kernel.set(10, (float)0.f); + mvn_group_kernel.set(11, ocl::KernelArg::PtrWriteOnly(output)); + ret = mvn_group_kernel.run(2, global, NULL, false); + if (!ret) { + return false; + } + + return true; + } +#endif + +#ifdef HAVE_CUDA + Ptr<BackendNode> initCUDA(void *context_, + const std::vector<Ptr<BackendWrapper>>& inputs, + const std::vector<Ptr<BackendWrapper>>& outputs) override { + auto context = reinterpret_cast<csl::CSLContext*>(context_); + + auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>(); + auto input_shape = input_wrapper->getShape(); + size_t N = 
input_shape[0]; + size_t num_groups = this->num_groups; + size_t loops = N * num_groups; + + return make_cuda_node<cuda4dnn::GroupNormOp>(preferableTarget, std::move(context->stream), epsilon, loops, num_groups); +} +#endif // HAVE_CUDA + +private: + float epsilon; + size_t num_groups; +}; + +Ptr<GroupNormLayer> GroupNormLayer::create(const LayerParams &params) { + return Ptr<GroupNormLayer>(new GroupNormLayerImpl(params)); +} + +}} // cv::dnn diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp index 2e9416fe25..5170a45cd1 100644 --- a/modules/dnn/src/layers/nary_eltwise_layers.cpp +++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp @@ -24,6 +24,16 @@ namespace cv namespace dnn { +namespace { +static int _mod(int x, int y) { + int res = x % y; + if ((res < 0 && y > 0) || (res > 0 && y < 0)) { + res += y; + } + return res; +} +} + class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer { public: @@ -42,7 +52,8 @@ public: MAX, MEAN, MIN, - MOD, + MOD, // Integer Mod. Remainder's sign = Divisor's sign. + FMOD, // Floating-point Mod. Remainder's sign = Dividend's sign. PROD, SUB, SUM, @@ -79,6 +90,8 @@ public: op = OPERATION::MIN; else if (operation == "mod") op = OPERATION::MOD; + else if (operation == "fmod") + op = OPERATION::FMOD; else if (operation == "mul") op = OPERATION::PROD; else if (operation == "sub") @@ -106,18 +119,21 @@ public: #ifdef HAVE_CANN if (backendId == DNN_BACKEND_CANN) return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB || - op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN; + op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN || + op == OPERATION::MOD || op == OPERATION::FMOD; #endif if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) return (op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::GREATER_EQUAL || - op == OPERATION::LESS_EQUAL + op == OPERATION::LESS_EQUAL || + op == OPERATION::MOD || + op == OPERATION::FMOD ); if (backendId == DNN_BACKEND_CUDA) { - return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM || - op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD || - op == OPERATION::SUB; + return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM || + op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD || + op == OPERATION::SUB || op == OPERATION::MOD || op == OPERATION::FMOD; } return backendId == DNN_BACKEND_OPENCV; } @@ -707,10 +723,16 @@ public: } case OPERATION::MOD: { - auto mod = [](const uint8_t &a, const uint8_t &b) { return a % b; }; + auto mod = [] (const T &a, const T &b) { return static_cast<T>(_mod(int(a), int(b))); }; binary_forward(mod, std::forward<Args>(args)...); break; } + case OPERATION::FMOD: + { + auto fmod = [](const T &a, const T &b) { return std::fmod(a, b); }; + binary_forward(fmod, std::forward<Args>(args)...); + break; + } case OPERATION::PROD: { auto prod = [](const T &a, const T &b) { return a * b; }; @@ -782,9 +804,8 @@ public: opDispatch<int32_t>(std::forward<Args>(args)...); break; case CV_32F: - CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::MOD && - op != OPERATION::AND && op != OPERATION::OR && - op != OPERATION::XOR); + CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::AND && + op != OPERATION::OR && op != OPERATION::XOR); opDispatch<float>(std::forward<Args>(args)...); break; default: @@ -801,19 +822,6 @@ public: { auto context = reinterpret_cast<csl::CSLContext*>(context_); - auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape(); - for (int i = 1; i < inputs.size(); i++) - { - auto input_i_shape = 
inputs[i].dynamicCast<CUDABackendWrapper>()->getShape(); - if (input_0_shape.size() != input_i_shape.size()) - return Ptr<BackendNode>(); - // check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode - for (int j = 0; j < input_0_shape.size(); j++) - if (input_0_shape[j] != input_i_shape[j] && - input_0_shape[j] != 1 && input_i_shape[j] != 1) - return Ptr<BackendNode>(); - } - cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM; switch (op) { case OPERATION::MAX: @@ -837,6 +845,12 @@ public: case OPERATION::SUB: op_ = cuda4dnn::EltwiseOpType::SUB; break; + case OPERATION::MOD: + op_ = cuda4dnn::EltwiseOpType::MOD; + break; + case OPERATION::FMOD: + op_ = cuda4dnn::EltwiseOpType::FMOD; + break; default: return Ptr<BackendNode>(); // return empty cuda_node if the EltwiseOpType is unsupported type. }; @@ -881,6 +895,8 @@ public: BUILD_CANN_ELTWISE_OP(OPERATION::DIV, Xdivy, name); BUILD_CANN_ELTWISE_OP(OPERATION::MAX, Maximum, name); BUILD_CANN_ELTWISE_OP(OPERATION::MIN, Minimum, name); + BUILD_CANN_ELTWISE_OP(OPERATION::MOD, Mod, name); + BUILD_CANN_ELTWISE_OP(OPERATION::FMOD, Mod, name); #undef BUILD_CANN_ELTWISE_OP default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation"); } @@ -927,6 +943,16 @@ public: node = std::make_shared<ngraph::op::v1::GreaterEqual>(inp0, inp1); else if (op == OPERATION::LESS_EQUAL) node = std::make_shared<ngraph::op::v1::LessEqual>(inp0, inp1); + // Ideally we should do this but int32 internal blobs are converted to float32 data type in inference. + // TODO: Remove data type conversion when we have type inference. + else if (op == OPERATION::MOD) { + auto inp0_i64 = std::make_shared<ngraph::op::Convert>(inp0, ngraph::element::i64); + auto inp1_i64 = std::make_shared<ngraph::op::Convert>(inp1, ngraph::element::i64); + auto mod = std::make_shared<ngraph::op::v1::Mod>(inp0_i64, inp1_i64); + node = std::make_shared<ngraph::op::Convert>(mod, ngraph::element::f32); + } + else if (op == OPERATION::FMOD) + node = std::make_shared<ngraph::op::v1::Mod>(inp0, inp1); else CV_Error(Error::StsNotImplemented, "Operation is not implemented for nGraph backend"); return Ptr<BackendNode>(new InfEngineNgraphNode(node)); diff --git a/modules/dnn/src/layers/scatterND_layer.cpp b/modules/dnn/src/layers/scatterND_layer.cpp index 648d35fc0c..0ab02146cb 100644 --- a/modules/dnn/src/layers/scatterND_layer.cpp +++ b/modules/dnn/src/layers/scatterND_layer.cpp @@ -74,6 +74,11 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); + if (inputs_arr.depth() == CV_16S) { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector<Mat> inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -89,49 +94,59 @@ public: // NOTE: This impl does not check whether indices have duplicate entries. // The last duplicate entry will overwrite the previous. 
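To make the duplicate-entry caveat above concrete, here is a minimal 1-D sketch of the scatter rule this kernel applies (plain C++ with illustrative names, not the layer's actual API): negative indices wrap around the dimension size, and colliding indices are applied in order, so under plain assignment the last update wins.

    #include <cassert>
    #include <functional>
    #include <vector>

    // 1-D scatter: combine updates[i] into data at indices[i] via `reduce`.
    // Duplicates are not detected; later entries re-apply over earlier ones.
    static std::vector<float> scatter_1d(std::vector<float> data,
                                         const std::vector<int>& indices,
                                         const std::vector<float>& updates,
                                         const std::function<float(float, float)>& reduce)
    {
        const int dim = static_cast<int>(data.size());
        for (size_t i = 0; i < indices.size(); ++i)
        {
            int idx = (indices[i] + dim) % dim;  // wrap negative indices, as the kernel does
            assert(idx >= 0 && idx < dim);
            data[idx] = reduce(data[idx], updates[i]);
        }
        return data;
    }

For example, scatter_1d({0,0,0}, {2,-1}, {5.f,7.f}, [](float, float u){ return u; }) leaves {0,0,7}: both indices normalize to 2, so the second update overwrites the first.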
template - void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) - { - data.copyTo(out); + void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) { + input_mat.copyTo(output_mat); - const int* shape = data.size.p; - const size_t* step = data.step.p; + const auto &input_mat_shape = shape(input_mat); + std::vector input_mat_step(input_mat_shape.size()); + for (int i = 0; i < input_mat.dims; i++) { + input_mat_step[i] = static_cast(input_mat.step.p[i] / sizeof(T)); + } - const int ind_ndims = indices.dims; - const int* ind_shape = indices.size.p; - const T* p_indices = indices.ptr(); + const int indices_mat_ndims = indices_mat.dims; + const auto &indices_mat_shape = shape(indices_mat); - const int upd_ndims = updates.dims; - const int* upd_shape = updates.size.p; - const T* p_updates = updates.ptr(); + const int updates_mat_ndims = updates_mat.dims; + const auto &updates_mat_shape = shape(updates_mat); - T* p_out = out.ptr(); - - int k = ind_shape[ind_ndims - 1]; // last dim of indices - size_t total = (size_t)(indices.total() / k); + int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices size_t updates_size = 1; - for (int i = ind_ndims - 1; i < upd_ndims; i++) - updates_size *= upd_shape[i]; + for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++) + updates_size *= updates_mat_shape[i]; - size_t inp_start_offset = 0; - size_t ind_start_offset = 0; - size_t upd_start_offset = 0; - for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size) - { - const T* tmp_p_indices = p_indices + ind_start_offset; - inp_start_offset = 0; - for (int j = 0; j < k; j++) - { - CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]); - inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j]; + auto fn = [&](const Range &r) { + size_t input_offset = 0, + indices_offset = r.start * indices_last_dim, + updates_offset = r.start * updates_size; + for (int i = r.start; i < r.end; i++) { + const T* indices = indices_mat.ptr(); + const T* updates = updates_mat.ptr(); + T* output = output_mat.ptr(); + + input_offset = 0; + indices += indices_offset; + for (int j = 0; j < indices_last_dim; j++) { + int index = static_cast(*(indices + j)); + index = (index + input_mat_shape[j]) % input_mat_shape[j]; + CV_Assert(index < input_mat_shape[j] && index >= 0); + input_offset += index * input_mat_step[j]; + } + + updates += updates_offset; + output += input_offset; + for (int j = 0; j < updates_size; j++) { + output[j] = reduce_operation(output[j], updates[j]); + } + + indices_offset += indices_last_dim; + updates_offset += updates_size; } - inp_start_offset /= sizeof(T); + }; - const T* tmp_p_updates = p_updates + upd_start_offset; - T* tmp_p_out = p_out + inp_start_offset; - for (int j = 0; j < updates_size; j++) - tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]); - } + size_t total = (size_t)(indices_mat.total() / indices_last_dim); + double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0); + parallel_for_(Range(0, total), fn, nstripes); } template diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp index 084eecb03c..24e4b54bc8 100644 --- a/modules/dnn/src/layers/scatter_layer.cpp +++ b/modules/dnn/src/layers/scatter_layer.cpp @@ -68,6 +68,11 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", 
name.c_str()); + if (inputs_arr.depth() == CV_16S) { + forward_fallback(inputs_arr, outputs_arr, internals_arr); + return; + } + std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -81,59 +86,62 @@ public: } template - void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out) - { - data.copyTo(out); + void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) { + input_mat.copyTo(output_mat); - const int ndims = data.dims; - const int* shape = data.size.p; - const size_t* step = data.step.p; + const int ndims = input_mat.dims; - const int* ind_shape = indices.size.p; - const size_t* ind_step = indices.step.p; + const auto &input_mat_shape = shape(input_mat); + std::vector input_mat_step(ndims); - size_t inp_offset = 0; - size_t ind_offset = 0; - const T* p_index = indices.ptr(); - const T* p_update = updates.ptr(); - T* p_out = out.ptr(); + const auto &indices_mat_shape = shape(indices_mat); + std::vector indices_mat_step(ndims); - size_t total = indices.total(); - - int j, offset_at_idx, index; - size_t t, idx; - for (size_t i = 0; i < total; i++) - { - t = i; - inp_offset = 0; - ind_offset = 0; - int offset_at_axis = 0; - for (j = ndims - 1; j >= 0; j--) - { - idx = t / ind_shape[j]; - offset_at_idx = (int)(t - idx * ind_shape[j]); - ind_offset += offset_at_idx * ind_step[j]; - inp_offset += offset_at_idx * step[j]; - t = idx; - if (j == axis) - { - offset_at_axis = offset_at_idx * step[j]; - } - } - ind_offset /= sizeof(T); - - // get index and overwrite current indices - const T* tmp_p_index = p_index + ind_offset; - index = (int)(*tmp_p_index); - CV_Assert(index < shape[axis] && index > -shape[axis]); - - inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis]; - inp_offset /= sizeof(T); - - const T* tmp_p_update = p_update + ind_offset; - T* tmp_p_out = p_out + inp_offset; - *tmp_p_out = rd(*tmp_p_out, *tmp_p_update); + for (int i = 0; i < ndims; i++) { + input_mat_step[i] = static_cast(input_mat.step.p[i] / sizeof(T)); + indices_mat_step[i] = static_cast(indices_mat.step.p[i] / sizeof(T)); } + + auto fn = [&](const Range &r) { + size_t input_offset = 0, indices_offset = 0; + + int indices_index, index; + size_t axis_offset, tmp_index, j_index; + for (int i = r.start; i < r.end; i++) { + const T* indices = indices_mat.ptr(); + const T* updates = updates_mat.ptr(); + T* output = output_mat.ptr(); + + input_offset = 0; + indices_offset = 0; + indices_index = i; + axis_offset = 0; + for (int j = ndims - 1; j >= 0; j--) { + tmp_index = indices_index / indices_mat_shape[j]; + j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]); + input_offset += j_index * input_mat_step[j]; + indices_offset += j_index * indices_mat_step[j]; + indices_index = tmp_index; + if (j == axis) { + axis_offset = j_index * input_mat_step[j]; + } + } + + // get index and overwrite current indices + index = static_cast(*(indices + indices_offset)); + index = (index + input_mat_shape[axis]) % input_mat_shape[axis]; + CV_Assert(index < input_mat_shape[axis] && index >= 0); + input_offset = input_offset - axis_offset + index * input_mat_step[axis]; + + updates += indices_offset; + output += input_offset; + *output = reduce_operation(*output, *updates); + } + }; + + size_t total = indices_mat.total(); + double nstripes = (size_t)total * ndims * (1 / 1024.0); + parallel_for_(Range(0, total), fn, 
nstripes); } template diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 732d2e2a34..21f3eac74f 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -901,7 +901,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName) CV_Assert(!empty()); FPDenormalsIgnoreHintScope fp_denormals_ignore_scope; -#ifdef CV_CXX11 String layerName = outputName; if (layerName.empty()) @@ -922,9 +921,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName) isAsync = false; return getBlobAsync(layerName); -#else - CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward requires build with enabled C++11"); -#endif // CV_CXX11 } diff --git a/modules/dnn/src/net_impl.hpp b/modules/dnn/src/net_impl.hpp index bd1fa213f7..364f8e9d35 100644 --- a/modules/dnn/src/net_impl.hpp +++ b/modules/dnn/src/net_impl.hpp @@ -265,11 +265,9 @@ struct Net::Impl : public detail::NetImplBase Mat getBlob(String outputName) const; -#ifdef CV_CXX11 virtual AsyncArray getBlobAsync(const LayerPin& pin); AsyncArray getBlobAsync(String outputName); -#endif // CV_CXX11 string dump(bool forceAllocation = false) const; diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp index dfa542bd41..b81bf14acc 100644 --- a/modules/dnn/src/net_impl_fuse.cpp +++ b/modules/dnn/src/net_impl_fuse.cpp @@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector& blobsToKeep_) if(inp_i_data->skip || inp_i_data->consumers.size() != 1) break; #ifdef HAVE_CUDA + /* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance + that Concat's output is filled with data in both host and device, leading to data missing. + See https://github.com/opencv/opencv/issues/24721 for more details. 
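+ In short: if one of Concat's input layers cannot run on CUDA it falls back to the CPU, so part of the concatenated buffer would be written on the host and part on the device; the guard below therefore also rejects fusion when supportBackend(DNN_BACKEND_CUDA) is false.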
+ */ if (preferableBackend == DNN_BACKEND_CUDA && (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false || (inp_i_data->layerInstance->type != "Convolution" && diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp index ee7a2c7b01..d45ff8c634 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_inner_product.cpp @@ -97,15 +97,19 @@ bool OCL4DNNInnerProduct::Forward(const UMat& bottom, max_image_size); } - if (use_half_ && bias_term_) - { - UMat biasOneMat = UMat::ones(M_, 1, CV_32F); - UMat newbias, tmpTop; + if (bias_term_) { + if (use_half_) { + UMat biasOneMat = UMat::ones(M_, 1, CV_32F); + UMat newbias, tmpTop; - convertFp16(bias, newbias); - convertFp16(top, tmpTop); - cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0); - convertFp16(tmpTop, top); + convertFp16(bias, newbias); + convertFp16(top, tmpTop); + cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0); + convertFp16(tmpTop, top); + } else { + UMat biasOnesMat = UMat::ones(M_, 1, CV_32F); + cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0); + } } return ret; diff --git a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp index 77dc1c52df..7b8dd483c7 100644 --- a/modules/dnn/src/onnx/onnx_graph_simplifier.cpp +++ b/modules/dnn/src/onnx/onnx_graph_simplifier.cpp @@ -86,6 +86,7 @@ public: int getTensorShapeSize(int node_id, int node_input_id) { const auto node = getNode(node_id); const auto &input_name = node->getInputName(node_input_id); + // try to get from value_info for (int i = 0; i < net.value_info_size(); i++) { const auto value_info = net.value_info(i); if (value_info.name() == input_name) { @@ -97,6 +98,18 @@ public: } } } + // try to get from input + for (int i = 0; i < net.input_size(); i++) { + const auto input = net.input(i); + if (input.name() == input_name) { + if (input.has_type() && input.type().has_tensor_type() && + input.type().tensor_type().has_shape()) { + return input.type().tensor_type().shape().dim_size(); + } else { + return -1; + } + } + } return -1; } @@ -660,6 +673,10 @@ private: [Input] -> LayerNorm -> [Output] \ [weight], [bias] + + Note: axes of ReduceMean must be: + - last element is the axis of last dimension (-1 or (input_ndims - 1)) + - a list of adjacent axes, e.g. 
[1, 2, 3, ..., input_ndims - 1] */ class LayerNormSubGraph : public Subgraph { @@ -683,19 +700,22 @@ public: setFusedNode("LayerNormalization", input); } - static float extractAxis(const Ptr& net, int node_id) + static std::vector extractAxis(const Ptr& net, int node_id) { + // TODO: consider ReduceMean-18 which has axes as one of the inputs instead of attributes Ptr mean_ptr = net->getNode(node_id); opencv_onnx::NodeProto* mean_node = mean_ptr.dynamicCast()->node; - int axis_ = -1; + std::vector axes; for (int i = 0; i < mean_node->attribute_size(); i++) { opencv_onnx::AttributeProto attr = mean_node->attribute(i); if (attr.name() != "axes") continue; - axis_ = static_cast(attr.ints(0)); + for (int j = 0; j < attr.ints_size(); j++) { + axes.push_back(attr.ints(j)); + } } - return axis_; + return axes; } virtual bool match(const Ptr& net, int nodeId, @@ -707,11 +727,31 @@ public: if (pow_exp - 2 > 1e-5) // not pow(2) return false; - int axis_mean1 = extractAxis(net, matchedNodesIds[mean]); - int axis_mean2 = extractAxis(net, matchedNodesIds[mean1]); - if (axis_mean1 != axis_mean2) + std::vector axes = extractAxis(net, matchedNodesIds[mean]); + // check whether it is -1 or last_axis or [axis, ..., last_axis] + int64_t input_ndims = static_cast(net.dynamicCast()->getTensorShapeSize(matchedNodesIds[mean], 0)); + if (input_ndims == -1) { + return false; // input shape unknown + } + // assume that axes are sorted in ascending order, e.g. [0, 1, 2, 3] or [-3, -2, -1] + if (axes.back() != -1 && axes.back() != (input_ndims - 1)) { return false; - axis = axis_mean1; + } + for (size_t i = 0; i < axes.size() - 1; i++) { + if (axes[i] - axes[i + 1] != -1) { + return false; + } + } + + std::vector axes1 = extractAxis(net, matchedNodesIds[mean1]); + if (axes.size() != axes1.size()) + return false; + for (size_t i = 0; i < axes.size(); i++) { + if (((axes[i] + input_ndims) % input_ndims) != ((axes1[i] + input_ndims) % input_ndims)) { + return false; + } + } + axis = axes[0]; epsilon = extractConstant(net, matchedNodesIds[add], 1).at(0); diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 09ebce7476..fee9d8c913 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -22,6 +22,7 @@ #ifdef HAVE_PROTOBUF +#include #include #include #include @@ -2619,6 +2620,7 @@ void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::Node // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1 int axis = layerParams.get("axis", 1); + axis = normalize_axis(axis, inputShape.size()); for (size_t i = 0; i < inputs.size(); ++i) { inputShape[axis] = inputs[i].dims == (int)inputShape.size() ? 
inputs[i].size[axis] : 1; @@ -2831,6 +2833,11 @@ void ONNXImporter::parseElementWise(LayerParams& layerParams, const opencv_onnx: layerParams.type = "NaryEltwise"; layerParams.set("operation", toLowerCase(node_proto.op_type())); + if (node_proto.op_type() == "Mod") { + if (layerParams.get("fmod", 0)) { + layerParams.set("operation", "fmod"); + }; + } // element-wise layers that can have >=1 inputs but actually have one input if (node_proto.input_size() == 1 && (op_type == "max" || op_type == "min" || op_type == "mean" || op_type == "sum")) @@ -4004,10 +4011,11 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version) dispatch["ScatterElements"] = dispatch["Scatter"] = dispatch["ScatterND"] = &ONNXImporter::parseScatter; dispatch["Tile"] = &ONNXImporter::parseTile; dispatch["LayerNormalization"] = &ONNXImporter::parseLayerNorm; + dispatch["GroupNormalization"] = &ONNXImporter::parseInstanceNormalization; dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] = dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = dispatch["GreaterOrEqual"] = - dispatch["LessOrEqual"] = &ONNXImporter::parseElementWise; + dispatch["LessOrEqual"] = dispatch["Mod"] = &ONNXImporter::parseElementWise; dispatch["Sum"] = dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseElementWise; dispatch["Where"] = &ONNXImporter::parseElementWise; diff --git a/modules/dnn/src/opencl/mvn.cl b/modules/dnn/src/opencl/mvn.cl index 7353ed8b82..053749b483 100644 --- a/modules/dnn/src/opencl/mvn.cl +++ b/modules/dnn/src/opencl/mvn.cl @@ -54,6 +54,7 @@ #define vec_type Dtype8 #define CALC_MEAN calc_mean8 #define MVN mvn8 + #define MVN_GROUP mvn_group8 #define MEAN_FUSE mean_fuse8 #define MVN_FUSE mvn_fuse8 #elif NUM == 4 @@ -62,6 +63,7 @@ #define vec_type Dtype4 #define CALC_MEAN calc_mean4 #define MVN mvn4 + #define MVN_GROUP mvn_group4 #define MEAN_FUSE mean_fuse4 #define MVN_FUSE mvn_fuse4 #elif NUM == 1 @@ -70,6 +72,7 @@ #define vec_type Dtype #define CALC_MEAN calc_mean1 #define MVN mvn1 + #define MVN_GROUP mvn_group1 #define MEAN_FUSE mean_fuse1 #define MVN_FUSE mvn_fuse1 #endif @@ -150,6 +153,54 @@ __kernel void MVN(__global const Dtype* src, store(dst_vec, dst, index); } +#elif defined KERNEL_MVN_GROUP + +__kernel void MVN_GROUP(__global const Dtype* src, + const int rows, + const int cols, + const Dtype eps, + __global const Dtype* mean, + __global const Dtype* dev, + __global const Dtype* weight, + __global const Dtype* bias, + const int channels, + const int num_groups, + const float relu_slope, + __global Dtype* dst) +{ + int x = get_global_id(0); + int y = get_global_id(1) * NUM; + int index = x * cols + y; + + if (x >= rows || y >= cols) + return; + + int group_size = channels / num_groups; + int step = norm_size / group_size; + int channel_index = x % num_groups * group_size + y / step + Dtype mean_val = mean[x]; + Dtype dev_val = dev[x]; + Dtype alpha; +#ifdef NORM_VARIANCE + alpha = 1 / sqrt(eps + dev_val); +#else + alpha = 1; +#endif + + Dtype w = weight[channel_index], b = bias[channel_index]; + + vec_type src_vec = load(src, index) - (vec_type)mean_val; + vec_type dst_vec = src_vec * alpha; + dst_vec = dst_vec * w + (vec_type)b; + +#ifdef FUSE_RELU + vec_type new_val = dst_vec * relu_slope; + dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f); +#endif + + store(dst_vec, dst, index); +} + #elif defined KERNEL_MEAN_FUSE __kernel void MEAN_FUSE(__global const T * A, diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index 
fda2ec5b0e..2c106f307d 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -95,6 +95,12 @@ public: Net net; }; +TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) { + processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0"); + expectNoFallbacksFromIE(net); + expectNoFallbacksFromCUDA(net); +} + TEST_P(DNNTestNetwork, AlexNet) { applyTestTag(CV_TEST_TAG_MEMORY_1GB); @@ -1454,6 +1460,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine( dnnBackendsAndTargets() )); +//////////////////////////////////////////////////////////////////////////////// +// Element-wise layers +//////////////////////////////////////////////////////////////////////////////// +using NaryEltwiseConcat = TestWithParam, tuple>>; +TEST_P(NaryEltwiseConcat, Accuracy) { + auto param = GetParam(); + std::vector input_shape = get<0>(param); + auto backend_id = get<0>(get<1>(param)); + auto target_id = get<1>(get<1>(param)); + + /* Build the following net: + + <1x4x84> + / + [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output] + | | + +-> Sigmoid ----------+ + + */ + Net net; + + std::vector mul_B_shape(input_shape.size() - 1, 1); + mul_B_shape.back() = input_shape.back(); + Mat mul_B(mul_B_shape, CV_32FC1); + randn(mul_B, 0.f, 1.f); + LayerParams mul_B_lp; + mul_B_lp.name = "mul_B"; + mul_B_lp.type = "Const"; + mul_B_lp.blobs.push_back(mul_B); + int id_mul_B = net.addLayer(mul_B_lp.name, mul_B_lp.type, mul_B_lp); + + LayerParams mul_lp; + mul_lp.name = "mul"; + mul_lp.type = "NaryEltwise"; + mul_lp.set("operation", "mul"); + int id_mul = net.addLayer(mul_lp.name, mul_lp.type, mul_lp); + net.connect(0, 0, id_mul, 0); + net.connect(id_mul_B, 0, id_mul, 1); + + LayerParams sigmoid_lp; + sigmoid_lp.name = "sigmoid"; + sigmoid_lp.type = "Sigmoid"; + int id_sigmoid = net.addLayer(sigmoid_lp.name, sigmoid_lp.type, sigmoid_lp); + net.connect(0, 0, id_sigmoid, 0); + + LayerParams concat_lp; + concat_lp.name = "concat"; + concat_lp.type = "Concat"; + concat_lp.set("axis", 1); + int id_concat = net.addLayer(concat_lp.name, concat_lp.type, concat_lp); + net.connect(id_mul, 0, id_concat, 0); + net.connect(id_sigmoid, 0, id_concat, 1); + + // Run test + Mat input(input_shape, CV_32FC1); + testLayer(input, net, backend_id, target_id, false); +} + +INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine( + testing::Values(std::vector{1, 4, 84}), + dnnBackendsAndTargets()) +); + + + INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/dnn/test/test_graph_simplifier.cpp b/modules/dnn/test/test_graph_simplifier.cpp index e09a68c158..91b4e271f5 100644 --- a/modules/dnn/test/test_graph_simplifier.cpp +++ b/modules/dnn/test/test_graph_simplifier.cpp @@ -47,6 +47,10 @@ TEST_F(Test_Graph_Simplifier, LayerNormSubGraph) { test("layer_norm_expanded_with_initializers", "LayerNormalization"); } +TEST_F(Test_Graph_Simplifier, LayerNormNoFusionSubGraph) { + test("layer_norm_no_fusion", std::vector{"NaryEltwise", "Reduce", "Sqrt"}); +} + TEST_F(Test_Graph_Simplifier, ResizeSubgraph) { /* Test for 6 subgraphs: - GatherCastSubgraph diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 09a45718a4..b2126ea447 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -2050,7 +2050,7 @@ private: net.setPreferableTarget(target); Mat re; - ASSERT_NO_THROW(re = net.forward()); // runtime error + re = net.forward(); auto ptr_re = (float *) 
re.data; for (int i = 0; i < re.total(); i++) if (op == "sum"){ diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index 6b7973db7e..83352dd150 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -1033,14 +1033,10 @@ TEST_P(Test_two_inputs, basic) randu(firstInp, 0, 100); randu(secondInp, 0, 100); -#ifndef CV_CXX11 std::vector input_names; input_names.push_back("data"); input_names.push_back("second_input"); net.setInputsNames(input_names); -#else - net.setInputsNames({"data", "second_input"}); -#endif net.setInput(firstInp, "data", kScale); net.setInput(secondInp, "second_input", kScaleInv); net.setPreferableBackend(backendId); diff --git a/modules/dnn/test/test_onnx_conformance.cpp b/modules/dnn/test/test_onnx_conformance.cpp index dfb8c4d1e5..a3e9a7020b 100644 --- a/modules/dnn/test/test_onnx_conformance.cpp +++ b/modules/dnn/test/test_onnx_conformance.cpp @@ -311,6 +311,8 @@ static const TestCase testConformanceConfig[] = { {"test_gridsample_nearest", 2, 1}, {"test_gridsample_reflection_padding", 2, 1}, {"test_gridsample_zeros_padding", 2, 1}, + {"test_group_normalization_epsilon", 3, 1}, + {"test_group_normalization_example", 3, 1}, {"test_gru_batchwise", 3, 2}, {"test_gru_defaults", 3, 1}, {"test_gru_seq_length", 4, 1}, diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp index 17d561d64b..291ea30e92 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter__openvino.inl.hpp @@ -736,6 +736,10 @@ CASE(test_gridsample_reflection_padding) // no filter CASE(test_gridsample_zeros_padding) // no filter +CASE(test_group_normalization_epsilon) + // no filter +CASE(test_group_normalization_example) + // no filter CASE(test_gru_batchwise) // no filter CASE(test_gru_defaults) @@ -1056,10 +1060,25 @@ CASE(test_mod_int64_fmod) // no filter CASE(test_mod_mixed_sign_float16) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001 + } CASE(test_mod_mixed_sign_float32) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001 + } CASE(test_mod_mixed_sign_float64) // no filter + if (target == DNN_TARGET_OPENCL) + { + default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104167 vs 1e-05 + default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156251 vs 0.0001 + } CASE(test_mod_mixed_sign_int16) // no filter CASE(test_mod_mixed_sign_int32) diff --git a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp index e2ea428939..0da0111990 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_filter_opencv_all_denylist.inl.hpp @@ -41,7 +41,7 @@ "test_cast_STRING_to_FLOAT", "test_castlike_FLOAT_to_STRING_expanded", "test_castlike_STRING_to_FLOAT_expanded", -"test_concat_1d_axis_negative_1", +"test_concat_1d_axis_negative_1", // 1d support is required "test_div_uint8", // output type mismatch "test_maxpool_2d_dilations", 
"test_maxpool_2d_same_lower", diff --git a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp index 29b5ac46cc..2985a71e8e 100644 --- a/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp +++ b/modules/dnn/test/test_onnx_conformance_layer_parser_denylist.inl.hpp @@ -210,9 +210,6 @@ "test_min_uint8", "test_mod_broadcast", "test_mod_int64_fmod", -"test_mod_mixed_sign_float16", -"test_mod_mixed_sign_float32", -"test_mod_mixed_sign_float64", "test_mod_mixed_sign_int16", "test_mod_mixed_sign_int32", "test_mod_mixed_sign_int64", diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index a128dd9f79..3149939369 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -2673,24 +2673,36 @@ void yoloPostProcessing( cv::transposeND(outs[0], {0, 2, 1}, outs[0]); } - // each row is [cx, cy, w, h, conf_obj, conf_class1, ..., conf_class80] + if (test_name == "yolonas"){ + // outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84] + Mat concat_out; + // squeeze the first dimension + outs[0] = outs[0].reshape(1, outs[0].size[1]); + outs[1] = outs[1].reshape(1, outs[1].size[1]); + cv::hconcat(outs[1], outs[0], concat_out); + outs[0] = concat_out; + // remove the second element + outs.pop_back(); + // unsqueeze the first dimension + outs[0] = outs[0].reshape(0, std::vector{1, 8400, 84}); + } + for (auto preds : outs){ preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85] - for (int i = 0; i < preds.rows; ++i) { - // filter out non objects - float obj_conf = (test_name != "yolov8") ? preds.at(i, 4) : 1.0f; + // filter out non object + float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at(i, 4) ; if (obj_conf < conf_threshold) continue; - Mat scores = preds.row(i).colRange((test_name != "yolov8") ? 5 : 4, preds.cols); + Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols); double conf; Point maxLoc; minMaxLoc(scores, 0, &conf, 0, &maxLoc); - conf = (test_name != "yolov8") ? conf * obj_conf : conf; + conf = (test_name == "yolov8" || test_name == "yolonas") ? 
conf : conf * obj_conf; if (conf < conf_threshold) continue; @@ -2701,10 +2713,15 @@ void yoloPostProcessing( double w = det[2]; double h = det[3]; + // std::cout << "cx: " << cx << " cy: " << cy << " w: " << w << " h: " << h << " conf: " << conf << " idx: " << maxLoc.x << std::endl; // [x1, y1, x2, y2] - boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, - cx + 0.5 * w, cy + 0.5 * h)); - classIds.push_back(maxLoc.x); + if (test_name == "yolonas"){ + boxes.push_back(Rect2d(cx, cy, w, h)); + } else { + boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h, + cx + 0.5 * w, cy + 0.5 * h)); + } + classIds.push_back(maxLoc.x); confidences.push_back(conf); } } @@ -2758,6 +2775,41 @@ TEST_P(Test_ONNX_nets, YOLOX) 1.0e-4, 1.0e-4); } +TEST_P(Test_ONNX_nets, YOLONas) +{ + // model information: https://dl.opencv.org/models/yolo-nas/Readme.md + std::string weightPath = _tf("models/yolo_nas_s.onnx", false); + + Size targetSize{640, 640}; + float conf_threshold = 0.50; + float iou_threshold = 0.50; + + std::vector refClassIds{1, 16, 7}; + std::vector refScores{0.9720f, 0.9283f, 0.8990f}; + // [x1, y1, x2, y2] + std::vector refBoxes{ + Rect2d(105.516, 173.696, 471.323, 430.433), + Rect2d(109.241, 263.406, 259.872, 531.858), + Rect2d(390.153, 142.492, 574.932, 222.709) + }; + + Image2BlobParams imgParams( + Scalar::all(1/255.0), + targetSize, + Scalar::all(0), + false, + CV_32F, + DNN_LAYOUT_NCHW, + DNN_PMODE_LETTERBOX, + Scalar::all(114) + ); + + testYOLO( + weightPath, refClassIds, refScores, refBoxes, + imgParams, conf_threshold, iou_threshold, + 1.0e-4, 1.0e-4, "yolonas"); +} + TEST_P(Test_ONNX_nets, YOLOv8) { std::string weightPath = _tf("models/yolov8n.onnx", false); @@ -2804,7 +2856,7 @@ TEST_P(Test_ONNX_nets, YOLOv7) CV_TEST_TAG_DEBUG_VERYLONG ); - std::string weightPath = _tf("models/yolov7_not_simplified.onnx", false); + std::string weightPath = _tf("models/yolov7.onnx", false); // Reference, which is collected with input size of 640x640 std::vector refClassIds{1, 16, 7}; std::vector refScores{0.9614331f, 0.9589417f, 0.8679074f}; @@ -3031,6 +3083,10 @@ TEST_P(Test_ONNX_nets, VitTrack) { normAssert(ref_output3, outputs[2], "VitTrack output3"); } +TEST_P(Test_ONNX_layers, LayerNormNoFusion) { + testONNXModels("layer_norm_no_fusion"); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp index 935f81d275..cdb246e4a2 100644 --- a/modules/gapi/src/backends/ie/giebackend.cpp +++ b/modules/gapi/src/backends/ie/giebackend.cpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
// -// Copyright (C) 2018-2023 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation #include "precomp.hpp" @@ -10,7 +10,7 @@ // (cv::gapi::ie::backend() is still there and is defined always) #include "backends/ie/giebackend.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #if INF_ENGINE_RELEASE <= 2019010000 # error G-API IE module supports only OpenVINO IE >= 2019 R1 diff --git a/modules/gapi/src/backends/ie/giebackend.hpp b/modules/gapi/src/backends/ie/giebackend.hpp index c7d938878d..98715fc2db 100644 --- a/modules/gapi/src/backends/ie/giebackend.hpp +++ b/modules/gapi/src/backends/ie/giebackend.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation #ifndef OPENCV_GAPI_GIEBACKEND_HPP #define OPENCV_GAPI_GIEBACKEND_HPP @@ -10,7 +10,7 @@ // Include anyway - cv::gapi::ie::backend() still needs to be defined #include "opencv2/gapi/infer/ie.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include // type_list_index #include diff --git a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp index a185e7b8ce..6df8187e16 100644 --- a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp +++ b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp @@ -2,9 +2,9 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include #include diff --git a/modules/gapi/test/infer/gapi_infer_ie_test.cpp b/modules/gapi/test/infer/gapi_infer_ie_test.cpp index 92de39abfa..8e91d576aa 100644 --- a/modules/gapi/test/infer/gapi_infer_ie_test.cpp +++ b/modules/gapi/test/infer/gapi_infer_ie_test.cpp @@ -6,7 +6,7 @@ #include "../test_precomp.hpp" -#ifdef HAVE_INF_ENGINE +#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000 #include #include diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp index 142ab00827..26c0741fe9 100644 --- a/modules/highgui/src/window_QT.cpp +++ b/modules/highgui/src/window_QT.cpp @@ -1662,14 +1662,14 @@ CvWindow::CvWindow(QString name, int arg2) //Now attach everything if (myToolBar) - myGlobalLayout->addWidget(myToolBar, Qt::AlignCenter); + myGlobalLayout->addWidget(myToolBar, 0, Qt::AlignLeft); - myGlobalLayout->addWidget(myView->getWidget(), Qt::AlignCenter); + myGlobalLayout->addWidget(myView->getWidget(), 0, Qt::AlignCenter); - myGlobalLayout->addLayout(myBarLayout, Qt::AlignCenter); + myGlobalLayout->addLayout(myBarLayout); if (myStatusBar) - myGlobalLayout->addWidget(myStatusBar, Qt::AlignCenter); + myGlobalLayout->addWidget(myStatusBar, 0, Qt::AlignLeft); setLayout(myGlobalLayout); show(); @@ -2079,7 +2079,6 @@ void CvWindow::createStatusBar() { myStatusBar = new QStatusBar(this); myStatusBar->setSizeGripEnabled(false); - myStatusBar->setFixedHeight(20); myStatusBar->setMinimumWidth(1); myStatusBar_msg = new QLabel; diff --git a/modules/imgcodecs/src/grfmt_jpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg.cpp index 506cebdf49..54dfea5a75 100644 --- a/modules/imgcodecs/src/grfmt_jpeg.cpp +++ 
b/modules/imgcodecs/src/grfmt_jpeg.cpp @@ -409,7 +409,9 @@ bool JpegDecoder::readData( Mat& img ) { jpeg_decompress_struct* cinfo = &((JpegState*)m_state)->cinfo; JpegErrorMgr* jerr = &((JpegState*)m_state)->jerr; +#ifndef JCS_EXTENSIONS JSAMPARRAY buffer = 0; +#endif if( setjmp( jerr->setjmp_buffer ) == 0 ) { @@ -429,6 +431,18 @@ bool JpegDecoder::readData( Mat& img ) } #endif +#ifdef JCS_EXTENSIONS + if( color ) + { + cinfo->out_color_space = JCS_EXT_BGR; + cinfo->out_color_components = 3; + } + else + { + cinfo->out_color_space = JCS_GRAYSCALE; + cinfo->out_color_components = 1; + } +#else if( color ) { if( cinfo->num_components != 4 ) @@ -455,6 +469,7 @@ bool JpegDecoder::readData( Mat& img ) cinfo->out_color_components = 4; } } +#endif // Check for Exif marker APP1 jpeg_saved_marker_ptr exif_marker = NULL; @@ -481,12 +496,17 @@ bool JpegDecoder::readData( Mat& img ) jpeg_start_decompress( cinfo ); +#ifndef JCS_EXTENSIONS buffer = (*cinfo->mem->alloc_sarray)((j_common_ptr)cinfo, JPOOL_IMAGE, m_width*4, 1 ); +#endif uchar* data = img.ptr(); for( ; m_height--; data += step ) { +#ifdef JCS_EXTENSIONS + jpeg_read_scanlines( cinfo, &data, 1 ); +#else jpeg_read_scanlines( cinfo, buffer, 1 ); if( color ) { @@ -502,6 +522,7 @@ bool JpegDecoder::readData( Mat& img ) else icvCvt_CMYK2Gray_8u_C4C1R( buffer[0], 0, data, 0, Size(m_width,1) ); } +#endif } result = true; @@ -593,8 +614,11 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) int width = img.cols, height = img.rows; std::vector out_buf(1 << 12); + +#ifndef JCS_EXTENSIONS AutoBuffer _buffer; uchar* buffer; +#endif struct jpeg_compress_struct cinfo; JpegErrorMgr jerr; @@ -629,8 +653,15 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) int _channels = img.channels(); int channels = _channels > 1 ? 3 : 1; + +#ifdef JCS_EXTENSIONS + cinfo.input_components = _channels; + cinfo.in_color_space = _channels == 3 ? JCS_EXT_BGR + : _channels == 4 ? JCS_EXT_BGRX : JCS_GRAYSCALE; +#else cinfo.input_components = channels; cinfo.in_color_space = channels > 1 ? 
JCS_RGB : JCS_GRAYSCALE; +#endif int quality = 95; int progressive = 0; @@ -746,14 +777,17 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) jpeg_start_compress( &cinfo, TRUE ); +#ifndef JCS_EXTENSIONS if( channels > 1 ) _buffer.allocate(width*channels); buffer = _buffer.data(); +#endif for( int y = 0; y < height; y++ ) { uchar *data = img.data + img.step*y, *ptr = data; +#ifndef JCS_EXTENSIONS if( _channels == 3 ) { icvCvt_BGR2RGB_8u_C3R( data, 0, buffer, 0, Size(width,1) ); @@ -764,6 +798,7 @@ bool JpegEncoder::write( const Mat& img, const std::vector& params ) icvCvt_BGRA2BGR_8u_C4C3R( data, 0, buffer, 0, Size(width,1), 2 ); ptr = buffer; } +#endif jpeg_write_scanlines( &cinfo, &ptr, 1 ); } diff --git a/modules/imgcodecs/src/loadsave.cpp b/modules/imgcodecs/src/loadsave.cpp index 692c924480..2fd19830f4 100644 --- a/modules/imgcodecs/src/loadsave.cpp +++ b/modules/imgcodecs/src/loadsave.cpp @@ -210,15 +210,8 @@ struct ImageCodecInitializer static ImageCodecInitializer& getCodecs() { -#ifdef CV_CXX11 static ImageCodecInitializer g_codecs; return g_codecs; -#else - // C++98 doesn't guarantee correctness of multi-threaded initialization of static global variables - // (memory leak here is not critical, use C++11 to avoid that) - static ImageCodecInitializer* g_codecs = new ImageCodecInitializer(); - return *g_codecs; -#endif } /** diff --git a/modules/imgcodecs/test/test_avif.cpp b/modules/imgcodecs/test/test_avif.cpp index 99b8f7769c..72b7f54fea 100644 --- a/modules/imgcodecs/test/test_avif.cpp +++ b/modules/imgcodecs/test/test_avif.cpp @@ -166,7 +166,7 @@ TEST_P(Imgcodecs_Avif_Image_EncodeDecodeSuite, imencode_imdecode) { cv::Exception); return; } - bool result; + bool result = true; EXPECT_NO_THROW( result = cv::imencode(".avif", img_original, buf, encoding_params_);); EXPECT_TRUE(result); diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp index 26796e3c27..e5a9da64cf 100644 --- a/modules/imgproc/include/opencv2/imgproc.hpp +++ b/modules/imgproc/include/opencv2/imgproc.hpp @@ -4490,7 +4490,7 @@ An example using applyColorMap function /** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. @param dst The result is the colormapped source image. Note: Mat::create is called on dst. @param colormap The colormap to apply, see #ColormapTypes */ @@ -4498,8 +4498,8 @@ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap); /** @brief Applies a user colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. -@param dst The result is the colormapped source image. Note: Mat::create is called on dst. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. +@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst. 
@param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256 */ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor); diff --git a/modules/java/CMakeLists.txt b/modules/java/CMakeLists.txt index 7fe90a0cb3..7207997e1b 100644 --- a/modules/java/CMakeLists.txt +++ b/modules/java/CMakeLists.txt @@ -17,7 +17,7 @@ ocv_add_module(java BINDINGS opencv_core opencv_imgproc PRIVATE_REQUIRED opencv_ include(${CMAKE_CURRENT_SOURCE_DIR}/common.cmake) -# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA, AIDL) +# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA) macro(glob_more_specific_sources _type _root _output) unset(_masks) if(${_type} STREQUAL "H") @@ -26,8 +26,6 @@ macro(glob_more_specific_sources _type _root _output) set(_masks "${_root}/cpp/*.cpp") elseif(${_type} STREQUAL "JAVA") set(_masks "${_root}/java/*.java" "${_root}/java/*.java.in") - elseif(${_type} STREQUAL "AIDL") - set(_masks "${_root}/java/*.aidl") endif() if (_masks) file(GLOB _result ${_masks}) diff --git a/modules/java/android_sdk/android_gradle_lib/build.gradle b/modules/java/android_sdk/android_gradle_lib/build.gradle index b887cdb4b9..4394bd9a4e 100644 --- a/modules/java/android_sdk/android_gradle_lib/build.gradle +++ b/modules/java/android_sdk/android_gradle_lib/build.gradle @@ -42,7 +42,6 @@ android { main { jniLibs.srcDirs = ['../../jni'] java.srcDirs = ['src'] // TODO Use original files instead of copied into build directory - aidl.srcDirs = ['src'] res.srcDirs = ['@OpenCV_SOURCE_DIR@/modules/java/android_sdk/android_gradle_lib/res'] manifest.srcFile 'AndroidManifest.xml' } diff --git a/modules/java/android_sdk/build.gradle.in b/modules/java/android_sdk/build.gradle.in index ccaa1c1d0c..a14f042496 100644 --- a/modules/java/android_sdk/build.gradle.in +++ b/modules/java/android_sdk/build.gradle.in @@ -121,8 +121,6 @@ android { targetCompatibility JavaVersion.VERSION_@ANDROID_GRADLE_JAVA_VERSION_INIT@ } - @ANDROID_GRADLE_BUILD_FEATURE_AIDL@ - buildTypes { debug { packagingOptions { @@ -139,7 +137,6 @@ android { } buildFeatures { - aidl true prefabPublishing true buildConfig true } @@ -153,7 +150,6 @@ android { main { jniLibs.srcDirs = ['native/libs'] java.srcDirs = ['java/src'] - aidl.srcDirs = ['java/src'] res.srcDirs = ['java/res'] manifest.srcFile 'java/AndroidManifest.xml' } diff --git a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java index c899389e25..6447f07b82 100644 --- a/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java +++ b/modules/java/generator/android-21/java/org/opencv/android/JavaCamera2View.java @@ -46,6 +46,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { protected ImageReader mImageReader; protected int mPreviewFormat = ImageFormat.YUV_420_888; protected int mRequestTemplate = CameraDevice.TEMPLATE_PREVIEW; + private int mFrameRotation; protected CameraDevice mCameraDevice; protected CameraCaptureSession mCaptureSession; @@ -86,8 +87,8 @@ public class JavaCamera2View extends CameraBridgeViewBase { } } - protected boolean initializeCamera() { - Log.i(LOGTAG, "initializeCamera"); + protected boolean selectCamera() { + Log.i(LOGTAG, "selectCamera"); CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE); try { String camList[] = manager.getCameraIdList(); @@ -110,14 +111,10 @@ public class JavaCamera2View 
extends CameraBridgeViewBase { } } } - if (mCameraID != null) { - Log.i(LOGTAG, "Opening camera: " + mCameraID); - manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); - } else { // make JavaCamera2View behaves in the same way as JavaCameraView - Log.i(LOGTAG, "Trying to open camera with the value (" + mCameraIndex + ")"); + if (mCameraID == null) { // make JavaCamera2View behaves in the same way as JavaCameraView + Log.i(LOGTAG, "Selecting camera by index (" + mCameraIndex + ")"); if (mCameraIndex < camList.length) { mCameraID = camList[mCameraIndex]; - manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); } else { // CAMERA_DISCONNECTED is used when the camera id is no longer valid throw new CameraAccessException(CameraAccessException.CAMERA_DISCONNECTED); @@ -125,11 +122,11 @@ public class JavaCamera2View extends CameraBridgeViewBase { } return true; } catch (CameraAccessException e) { - Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e); + Log.e(LOGTAG, "selectCamera - Camera Access Exception", e); } catch (IllegalArgumentException e) { - Log.e(LOGTAG, "OpenCamera - Illegal Argument Exception", e); + Log.e(LOGTAG, "selectCamera - Illegal Argument Exception", e); } catch (SecurityException e) { - Log.e(LOGTAG, "OpenCamera - Security Exception", e); + Log.e(LOGTAG, "selectCamera - Security Exception", e); } return false; } @@ -204,6 +201,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { mImageReader.setOnImageAvailableListener(new ImageReader.OnImageAvailableListener() { @Override public void onImageAvailable(ImageReader reader) { + Image image = reader.acquireLatestImage(); if (image == null) return; @@ -213,8 +211,9 @@ public class JavaCamera2View extends CameraBridgeViewBase { assert (planes.length == 3); assert (image.getFormat() == mPreviewFormat); - JavaCamera2Frame tempFrame = new JavaCamera2Frame(image); + RotatedCameraFrame tempFrame = new RotatedCameraFrame(new JavaCamera2Frame(image), mFrameRotation); deliverAndDrawFrame(tempFrame); + tempFrame.mFrame.release(); tempFrame.release(); image.close(); } @@ -303,11 +302,22 @@ public class JavaCamera2View extends CameraBridgeViewBase { protected boolean connectCamera(int width, int height) { Log.i(LOGTAG, "setCameraPreviewSize(" + width + "x" + height + ")"); startBackgroundThread(); - initializeCamera(); + selectCamera(); try { + CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE); + CameraCharacteristics characteristics = manager.getCameraCharacteristics(mCameraID); + mFrameRotation = getFrameRotation( + characteristics.get(CameraCharacteristics.LENS_FACING) == CameraCharacteristics.LENS_FACING_FRONT, + characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION)); + boolean needReconfig = calcPreviewSize(width, height); - mFrameWidth = mPreviewSize.getWidth(); - mFrameHeight = mPreviewSize.getHeight(); + if (mFrameRotation % 180 == 0) { + mFrameWidth = mPreviewSize.getWidth(); + mFrameHeight = mPreviewSize.getHeight(); + } else { + mFrameWidth = mPreviewSize.getHeight(); + mFrameHeight = mPreviewSize.getWidth(); + } if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT)) mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth); @@ -322,12 +332,16 @@ public class JavaCamera2View extends CameraBridgeViewBase { mCaptureSession.close(); mCaptureSession = null; } - createCameraPreviewSession(); } if (mFpsMeter != null) { mFpsMeter.setResolution(mFrameWidth, 
mFrameHeight); } + + Log.i(LOGTAG, "Opening camera: " + mCameraID); + manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler); + } catch (CameraAccessException e) { + Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e); } catch (RuntimeException e) { throw new RuntimeException("Interrupted while setCameraPreviewSize.", e); } @@ -442,6 +456,7 @@ public class JavaCamera2View extends CameraBridgeViewBase { mGray = new Mat(); } + @Override public void release() { mRgba.release(); mGray.release(); diff --git a/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java index 44ed8c4114..b28c2121cd 100644 --- a/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java +++ b/modules/java/generator/android-24/java/org/opencv/android/NativeCameraView.java @@ -10,6 +10,7 @@ import org.opencv.videoio.VideoCapture; import org.opencv.videoio.VideoWriter; import android.content.Context; +import android.hardware.Camera; import android.util.AttributeSet; import android.util.Log; import android.view.ViewGroup.LayoutParams; @@ -25,7 +26,7 @@ public class NativeCameraView extends CameraBridgeViewBase { private Thread mThread; protected VideoCapture mCamera; - protected NativeCameraFrame mFrame; + protected RotatedCameraFrame mFrame; public NativeCameraView(Context context, int cameraId) { super(context, cameraId); @@ -89,28 +90,65 @@ public class NativeCameraView extends CameraBridgeViewBase { private boolean initializeCamera(int width, int height) { synchronized (this) { - - if (mCameraIndex == -1) { + Camera.CameraInfo cameraInfo = new Camera.CameraInfo(); + int localCameraIndex = mCameraIndex; + if (mCameraIndex == CAMERA_ID_ANY) { Log.d(TAG, "Try to open default camera"); - mCamera = new VideoCapture(0, Videoio.CAP_ANDROID); - } else { - Log.d(TAG, "Try to open camera with index " + mCameraIndex); - mCamera = new VideoCapture(mCameraIndex, Videoio.CAP_ANDROID); + localCameraIndex = 0; + } else if (mCameraIndex == CAMERA_ID_BACK) { + Log.i(TAG, "Trying to open back camera"); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) { + localCameraIndex = camIdx; + break; + } + } + } else if (mCameraIndex == CAMERA_ID_FRONT) { + Log.i(TAG, "Trying to open front camera"); + for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) { + Camera.getCameraInfo( camIdx, cameraInfo ); + if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) { + localCameraIndex = camIdx; + break; + } + } } + if (localCameraIndex == CAMERA_ID_BACK) { + Log.e(TAG, "Back camera not found!"); + return false; + } else if (localCameraIndex == CAMERA_ID_FRONT) { + Log.e(TAG, "Front camera not found!"); + return false; + } + + Log.d(TAG, "Try to open camera with index " + localCameraIndex); + mCamera = new VideoCapture(localCameraIndex, Videoio.CAP_ANDROID); + if (mCamera == null) return false; - if (mCamera.isOpened() == false) return false; - mFrame = new NativeCameraFrame(mCamera); + if (mCameraIndex != CAMERA_ID_BACK && mCameraIndex != CAMERA_ID_FRONT) + Camera.getCameraInfo(localCameraIndex, cameraInfo); + int frameRotation = getFrameRotation( + cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT, + cameraInfo.orientation); + + mFrame = new RotatedCameraFrame(new NativeCameraFrame(mCamera), frameRotation); mCamera.set(Videoio.CAP_PROP_FRAME_WIDTH, 
width); mCamera.set(Videoio.CAP_PROP_FRAME_HEIGHT, height); - mFrameWidth = (int)mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); - mFrameHeight = (int)mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + if (frameRotation % 180 == 0) { + mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); + mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + } else { + mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT); + mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH); + } if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT)) mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth); @@ -131,7 +169,10 @@ public class NativeCameraView extends CameraBridgeViewBase { private void releaseCamera() { synchronized (this) { - if (mFrame != null) mFrame.release(); + if (mFrame != null) { + mFrame.mFrame.release(); + mFrame.release(); + } if (mCamera != null) mCamera.release(); } } @@ -162,6 +203,7 @@ public class NativeCameraView extends CameraBridgeViewBase { mBgr = new Mat(); } + @Override public void release() { if (mGray != null) mGray.release(); if (mRgba != null) mRgba.release(); diff --git a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java index 1993cf1407..4aa6a350f8 100644 --- a/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java +++ b/modules/java/generator/android/java/org/opencv/android/CameraBridgeViewBase.java @@ -4,6 +4,7 @@ import java.util.List; import org.opencv.BuildConfig; import org.opencv.R; +import org.opencv.core.Core; import org.opencv.core.Mat; import org.opencv.core.Size; @@ -17,8 +18,10 @@ import android.graphics.Canvas; import android.graphics.Rect; import android.util.AttributeSet; import android.util.Log; +import android.view.Surface; import android.view.SurfaceHolder; import android.view.SurfaceView; +import android.view.WindowManager; /** * This is a basic class, implementing the interaction with Camera and OpenCV library. 
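Note: the NativeCameraView changes above rotate every captured frame to the current display orientation and, for 90/270-degree rotations, swap the reported frame width and height. The rule implemented by the getFrameRotation() helper added below can be sketched in Python (illustrative only, not part of the patch):

    # 'sensor' is Camera.CameraInfo.orientation, 'screen' is the display
    # rotation, both in degrees (0/90/180/270).
    def frame_rotation(facing_front, sensor, screen):
        if facing_front:
            return (sensor + screen) % 360
        return (sensor - screen + 360) % 360

    def rotated_size(raw_w, raw_h, rotation):
        # 90/270-degree rotations swap width and height
        return (raw_w, raw_h) if rotation % 180 == 0 else (raw_h, raw_w)

    assert frame_rotation(False, 90, 90) == 0   # back camera, landscape
    assert rotated_size(640, 480, 90) == (480, 640)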
@@ -189,8 +192,93 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac * This method returns single channel gray scale Mat with frame */ public Mat gray(); + + public void release(); }; + public class RotatedCameraFrame implements CvCameraViewFrame { + @Override + public Mat gray() { + if (mRotation != 0) { + Core.rotate(mFrame.gray(), mGrayRotated, getCvRotationCode(mRotation)); + return mGrayRotated; + } else { + return mFrame.gray(); + } + } + + @Override + public Mat rgba() { + if (mRotation != 0) { + Core.rotate(mFrame.rgba(), mRgbaRotated, getCvRotationCode(mRotation)); + return mRgbaRotated; + } else { + return mFrame.rgba(); + } + } + + private int getCvRotationCode(int degrees) { + if (degrees == 90) { + return Core.ROTATE_90_CLOCKWISE; + } else if (degrees == 180) { + return Core.ROTATE_180; + } else { + return Core.ROTATE_90_COUNTERCLOCKWISE; + } + } + + public RotatedCameraFrame(CvCameraViewFrame frame, int rotation) { + super(); + mFrame = frame; + mRgbaRotated = new Mat(); + mGrayRotated = new Mat(); + mRotation = rotation; + } + + @Override + public void release() { + mRgbaRotated.release(); + mGrayRotated.release(); + } + + public CvCameraViewFrame mFrame; + private Mat mRgbaRotated; + private Mat mGrayRotated; + private int mRotation; + }; + + /** + * Calculates how to rotate camera frame to match current screen orientation + */ + protected int getFrameRotation(boolean cameraFacingFront, int cameraSensorOrientation) { + WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE); + int screenOrientation = windowManager.getDefaultDisplay().getRotation(); + int screenRotation = 0; + switch (screenOrientation) { + case Surface.ROTATION_0: + screenRotation = 0; + break; + case Surface.ROTATION_90: + screenRotation = 90; + break; + case Surface.ROTATION_180: + screenRotation = 180; + break; + case Surface.ROTATION_270: + screenRotation = 270; + break; + } + + int frameRotation; + if (cameraFacingFront) { + frameRotation = (cameraSensorOrientation + screenRotation) % 360; + } else { + frameRotation = (cameraSensorOrientation - screenRotation + 360) % 360; + } + + return frameRotation; + } + public void surfaceChanged(SurfaceHolder arg0, int arg1, int arg2, int arg3) { Log.d(TAG, "call surfaceChanged event"); synchronized(mSyncObject) { diff --git a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java index a7c72e43f0..b76f186101 100644 --- a/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java +++ b/modules/java/generator/android/java/org/opencv/android/JavaCameraView.java @@ -10,9 +10,12 @@ import android.hardware.Camera.PreviewCallback; import android.os.Build; import android.util.AttributeSet; import android.util.Log; +import android.view.Surface; import android.view.ViewGroup.LayoutParams; +import android.view.WindowManager; import org.opencv.BuildConfig; +import org.opencv.core.Core; import org.opencv.core.CvType; import org.opencv.core.Mat; import org.opencv.core.Size; @@ -39,7 +42,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb private boolean mStopThread; protected Camera mCamera; - protected JavaCameraFrame[] mCameraFrame; + protected RotatedCameraFrame[] mCameraFrame; private SurfaceTexture mSurfaceTexture; private int mPreviewFormat = ImageFormat.NV21; @@ -71,28 +74,20 @@ public class JavaCameraView extends CameraBridgeViewBase implements 
PreviewCallb
         boolean result = true;
         synchronized (this) {
             mCamera = null;
+            int cameraId = -1;
             if (mCameraIndex == CAMERA_ID_ANY) {
-                Log.d(TAG, "Trying to open camera with old open()");
-                try {
-                    mCamera = Camera.open();
-                }
-                catch (Exception e){
-                    Log.e(TAG, "Camera is not available (in use or does not exist): " + e.getLocalizedMessage());
-                }
-
-                if(mCamera == null && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
-                    boolean connected = false;
-                    for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
-                        Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")");
-                        try {
-                            mCamera = Camera.open(camIdx);
-                            connected = true;
-                        } catch (RuntimeException e) {
-                            Log.e(TAG, "Camera #" + camIdx + " failed to open: " + e.getLocalizedMessage());
-                        }
-                        if (connected) break;
+                boolean connected = false;
+                for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
+                    Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")");
+                    try {
+                        mCamera = Camera.open(camIdx);
+                        connected = true;
+                        cameraId = camIdx;
+                    } catch (RuntimeException e) {
+                        Log.e(TAG, "Camera #" + camIdx + " failed to open: " + e.getLocalizedMessage());
                     }
+                    if (connected) break;
                 }
             } else {
                 if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
@@ -126,6 +121,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
                 Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")");
                 try {
                     mCamera = Camera.open(localCameraIndex);
+                    cameraId = localCameraIndex;
                 } catch (RuntimeException e) {
                     Log.e(TAG, "Camera #" + localCameraIndex + " failed to open: " + e.getLocalizedMessage());
                 }
@@ -136,6 +132,11 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
         if (mCamera == null)
             return false;
+        android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo();
+        android.hardware.Camera.getCameraInfo(cameraId, info);
+        int frameRotation = getFrameRotation(
+                info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT,
+                info.orientation);
         /* Now set camera parameters */
         try {
             Camera.Parameters params = mCamera.getParameters();
@@ -176,8 +177,16 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
             mCamera.setParameters(params);
             params = mCamera.getParameters();
-            mFrameWidth = params.getPreviewSize().width;
-            mFrameHeight = params.getPreviewSize().height;
+            int rawFrameWidth = params.getPreviewSize().width;
+            int rawFrameHeight = params.getPreviewSize().height;
+
+            if (frameRotation % 180 == 0) {
+                mFrameWidth = params.getPreviewSize().width;
+                mFrameHeight = params.getPreviewSize().height;
+            } else {
+                mFrameWidth = params.getPreviewSize().height;
+                mFrameHeight = params.getPreviewSize().width;
+            }
             if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
                 mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@@ -196,14 +205,14 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
             mCamera.setPreviewCallbackWithBuffer(this);
             mFrameChain = new Mat[2];
-            mFrameChain[0] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
-            mFrameChain[1] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
+            mFrameChain[0] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
+            mFrameChain[1] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
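Note: the buffers above stay at the raw sensor size (rawFrameWidth x rawFrameHeight) because NV21 preview data is one full-height luma plane plus a half-height interleaved chroma plane, i.e. (h + h/2) rows of 8-bit single-channel data; rotation is applied later, per frame, by RotatedCameraFrame. A quick Python check of that layout (illustrative only):

    import numpy as np
    import cv2 as cv

    h, w = 480, 640
    nv21 = np.zeros((h + h // 2, w), dtype=np.uint8)  # same shape as the mFrameChain mats
    rgba = cv.cvtColor(nv21, cv.COLOR_YUV2RGBA_NV21)
    assert rgba.shape == (h, w, 4)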
AllocateCache(); - mCameraFrame = new JavaCameraFrame[2]; - mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], mFrameWidth, mFrameHeight); - mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], mFrameWidth, mFrameHeight); + mCameraFrame = new RotatedCameraFrame[2]; + mCameraFrame[0] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight), frameRotation); + mCameraFrame[1] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight), frameRotation); if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) { mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID); @@ -240,7 +249,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mFrameChain[1].release(); } if (mCameraFrame != null) { + mCameraFrame[0].mFrame.release(); mCameraFrame[0].release(); + mCameraFrame[1].mFrame.release(); mCameraFrame[1].release(); } } @@ -336,6 +347,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb mRgba = new Mat(); } + @Override public void release() { mRgba.release(); } diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index ba5ed49416..16b020465a 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -1254,13 +1254,13 @@ JNIEXPORT void JNICALL Java_org_opencv_%(module)s_%(j_cls)s_delete def copy_java_files(java_files_dir, java_base_path, default_package_path='org/opencv/'): global total_files, updated_files java_files = [] - re_filter = re.compile(r'^.+\.(java|aidl|kt)(.in)?$') + re_filter = re.compile(r'^.+\.(java|kt)(.in)?$') for root, dirnames, filenames in os.walk(java_files_dir): java_files += [os.path.join(root, filename) for filename in filenames if re_filter.match(filename)] java_files = [f.replace('\\', '/') for f in java_files] re_package = re.compile(r'^package +(.+);') - re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|aidl|kt)(.in)?$') + re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|kt)(.in)?$') for java_file in java_files: src = checkFileRemap(java_file) with open(src, 'r') as f: diff --git a/modules/objdetect/include/opencv2/objdetect/barcode.hpp b/modules/objdetect/include/opencv2/objdetect/barcode.hpp index 958490a422..788889ad40 100644 --- a/modules/objdetect/include/opencv2/objdetect/barcode.hpp +++ b/modules/objdetect/include/opencv2/objdetect/barcode.hpp @@ -27,7 +27,7 @@ public: * @param prototxt_path prototxt file path for the super resolution model * @param model_path model file path for the super resolution model */ - CV_WRAP BarcodeDetector(const std::string &prototxt_path, const std::string &model_path); + CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path); ~BarcodeDetector(); /** @brief Decodes barcode in image once it's found by the detect() method. 
diff --git a/modules/objdetect/include/opencv2/objdetect/face.hpp b/modules/objdetect/include/opencv2/objdetect/face.hpp
index 9b53f83128..bfa04cbd16 100644
--- a/modules/objdetect/include/opencv2/objdetect/face.hpp
+++ b/modules/objdetect/include/opencv2/objdetect/face.hpp
@@ -82,8 +82,8 @@ public:
      * @param backend_id the id of backend
      * @param target_id the id of target device
      */
-    CV_WRAP static Ptr<FaceDetectorYN> create(const String& model,
-                                              const String& config,
+    CV_WRAP static Ptr<FaceDetectorYN> create(CV_WRAP_FILE_PATH const String& model,
+                                              CV_WRAP_FILE_PATH const String& config,
                                               const Size& input_size,
                                               float score_threshold = 0.9f,
                                               float nms_threshold = 0.3f,
@@ -154,7 +154,7 @@ public:
      * @param backend_id the id of backend
      * @param target_id the id of target device
      */
-    CV_WRAP static Ptr<FaceRecognizerSF> create(const String& model, const String& config, int backend_id = 0, int target_id = 0);
+    CV_WRAP static Ptr<FaceRecognizerSF> create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0);
 };
 //! @}
diff --git a/modules/objdetect/src/aruco/aruco_board.cpp b/modules/objdetect/src/aruco/aruco_board.cpp
index 3d4217e02a..f8d3d3c108 100644
--- a/modules/objdetect/src/aruco/aruco_board.cpp
+++ b/modules/objdetect/src/aruco/aruco_board.cpp
@@ -483,39 +483,44 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi
     Mat noMarginsImg = out.colRange(marginSize, out.cols - marginSize).rowRange(marginSize, out.rows - marginSize);
-    double totalLengthX, totalLengthY;
-    totalLengthX = squareLength * size.width;
-    totalLengthY = squareLength * size.height;
-
-    // proportional transformation
-    double xReduction = totalLengthX / double(noMarginsImg.cols);
-    double yReduction = totalLengthY / double(noMarginsImg.rows);
+    // the size of the chessboard square depends on the location of the chessboard
+    float pixInSquare = 0.f;
+    // the size of the chessboard in pixels
+    Size pixInChessboard(noMarginsImg.cols, noMarginsImg.rows);
     // determine the zone where the chessboard is placed
-    Mat chessboardZoneImg;
-    if(xReduction > yReduction) {
-        int nRows = int(totalLengthY / xReduction);
-        int rowsMargins = (noMarginsImg.rows - nRows) / 2;
-        chessboardZoneImg = noMarginsImg.rowRange(rowsMargins, noMarginsImg.rows - rowsMargins);
-    } else {
-        int nCols = int(totalLengthX / yReduction);
-        int colsMargins = (noMarginsImg.cols - nCols) / 2;
-        chessboardZoneImg = noMarginsImg.colRange(colsMargins, noMarginsImg.cols - colsMargins);
+    float pixInSquareX = (float)noMarginsImg.cols / (float)size.width;
+    float pixInSquareY = (float)noMarginsImg.rows / (float)size.height;
+    Point startChessboard(0, 0);
+    if (pixInSquareX <= pixInSquareY) {
+        // the width of "noMarginsImg" image determines the dimensions of the chessboard
+        pixInSquare = pixInSquareX;
+        pixInChessboard.height = cvRound(pixInSquare*size.height);
+        int rowsMargin = (noMarginsImg.rows - pixInChessboard.height) / 2;
+        startChessboard.y = rowsMargin;
     }
+    else {
+        // the height of "noMarginsImg" image determines the dimensions of the chessboard
+        pixInSquare = pixInSquareY;
+        pixInChessboard.width = cvRound(pixInSquare*size.width);
+        int colsMargin = (noMarginsImg.cols - pixInChessboard.width) / 2;
+        startChessboard.x = colsMargin;
+    }
+    // determine the zone where the chessboard is located
+    Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
-    // determine the margins to draw only the markers
-    // take the minimum just to be sure
-    double squareSizePixels = min(double(chessboardZoneImg.cols) /
double(size.width), - double(chessboardZoneImg.rows) / double(size.height)); + // marker size in pixels + const float pixInMarker = markerLength/squareLength*pixInSquare; + // the size of the marker margin in pixels + const float pixInMarginMarker = 0.5f*(pixInSquare - pixInMarker); - double diffSquareMarkerLength = (squareLength - markerLength) / 2; - int diffSquareMarkerLengthPixels = - int(diffSquareMarkerLength * squareSizePixels / squareLength); + // determine the zone where the aruco markers are located + int endArucoX = cvRound(pixInSquare*(size.width-1)+pixInMarginMarker+pixInMarker); + int endArucoY = cvRound(pixInSquare*(size.height-1)+pixInMarginMarker+pixInMarker); + Mat arucoZone = chessboardZoneImg(Range(cvRound(pixInMarginMarker), endArucoY), Range(cvRound(pixInMarginMarker), endArucoX)); // draw markers - Mat markersImg; - Board::Impl::generateImage(chessboardZoneImg.size(), markersImg, diffSquareMarkerLengthPixels, borderBits); - markersImg.copyTo(chessboardZoneImg); + Board::Impl::generateImage(arucoZone.size(), arucoZone, 0, borderBits); // now draw black squares for(int y = 0; y < size.height; y++) { @@ -527,12 +532,11 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi if(y % 2 != x % 2) continue; // white corner, dont do anything } - double startX, startY; - startX = squareSizePixels * double(x); - startY = squareSizePixels * double(y); + float startX = pixInSquare * float(x); + float startY = pixInSquare * float(y); - Mat squareZone = chessboardZoneImg.rowRange(int(startY), int(startY + squareSizePixels)) - .colRange(int(startX), int(startX + squareSizePixels)); + Mat squareZone = chessboardZoneImg(Range(cvRound(startY), cvRound(startY + pixInSquare)), + Range(cvRound(startX), cvRound(startX + pixInSquare))); squareZone.setTo(0); } diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp index 71fdc17182..8ed6398ebb 100644 --- a/modules/objdetect/src/aruco/aruco_detector.cpp +++ b/modules/objdetect/src/aruco/aruco_detector.cpp @@ -684,7 +684,7 @@ struct ArucoDetector::ArucoDetectorImpl { contours.clear(); // sort candidates from big to small - std::sort(candidateTree.begin(), candidateTree.end()); + std::stable_sort(candidateTree.begin(), candidateTree.end()); // group index for each candidate vector groupId(candidateTree.size(), -1); vector > groupedCandidates; @@ -728,11 +728,11 @@ struct ArucoDetector::ArucoDetectorImpl { for (vector& grouped : groupedCandidates) { if (detectorParams.detectInvertedMarker) // if detectInvertedMarker choose smallest contours - std::sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) { + std::stable_sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) { return a > b; }); else // if detectInvertedMarker==false choose largest contours - std::sort(grouped.begin(), grouped.end()); + std::stable_sort(grouped.begin(), grouped.end()); size_t currId = grouped[0]; isSelectedContours[currId] = true; for (size_t i = 1ull; i < grouped.size(); i++) { @@ -780,7 +780,7 @@ struct ArucoDetector::ArucoDetectorImpl { vector idsTmp(ncandidates, -1); vector rotated(ncandidates, 0); vector validCandidates(ncandidates, 0); - vector was(ncandidates, false); + vector was(ncandidates, false); bool checkCloseContours = true; int maxDepth = 0; diff --git a/modules/objdetect/src/precomp.hpp b/modules/objdetect/src/precomp.hpp index 790a980697..63ca440076 100644 --- a/modules/objdetect/src/precomp.hpp +++ 
b/modules/objdetect/src/precomp.hpp
@@ -52,5 +52,7 @@
 #include "opencv2/core/private.hpp"
 #include
+#include
+#include
 #endif
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 37f276085f..cf5c5c7b18 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -15,6 +15,7 @@
 #include "quirc.h"
 #endif
+#include
 #include
 #include
 #include
diff --git a/modules/objdetect/test/test_charucodetection.cpp b/modules/objdetect/test/test_charucodetection.cpp
index 89608213be..ea3252a22a 100644
--- a/modules/objdetect/test/test_charucodetection.cpp
+++ b/modules/objdetect/test/test_charucodetection.cpp
@@ -771,6 +771,57 @@ TEST_P(CharucoBoard, testWrongSizeDetection)
     ASSERT_TRUE(detectedCharucoIds.empty());
 }
+TEST(CharucoBoardGenerate, issue_24806)
+{
+    aruco::Dictionary dict = aruco::getPredefinedDictionary(aruco::DICT_4X4_1000);
+    const float squareLength = 13.f, markerLength = 10.f;
+    const Size boardSize(7ull, 4ull);
+    const aruco::CharucoBoard board(boardSize, squareLength, markerLength, dict);
+    const int marginSize = 24;
+    Mat boardImg;
+
+    // generate chessboard image
+    board.generateImage(Size(400, 300), boardImg, marginSize);
+    // This condition checks that the width of the image determines the dimensions of the chessboard in this test
+    CV_Assert((float)(boardImg.cols) / (float)boardSize.width <=
+              (float)(boardImg.rows) / (float)boardSize.height);
+
+    // prepare data for chessboard image test
+    Mat noMarginsImg = boardImg(Range(marginSize, boardImg.rows - marginSize),
+                                Range(marginSize, boardImg.cols - marginSize));
+    const float pixInSquare = (float)(noMarginsImg.cols) / (float)boardSize.width;
+
+    Size pixInChessboard(cvRound(pixInSquare*boardSize.width), cvRound(pixInSquare*boardSize.height));
+    const Point startChessboard((noMarginsImg.cols - pixInChessboard.width) / 2,
+                                (noMarginsImg.rows - pixInChessboard.height) / 2);
+    Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
+
+    // B - black pixel, W - white pixel
+    // chessboard corner 1:
+    // B W
+    // W B
+    Mat goldCorner1 = (Mat_<uint8_t>(2, 2) <<
+        0, 255,
+        255, 0);
+    // B - black pixel, W - white pixel
+    // chessboard corner 2:
+    // W B
+    // B W
+    Mat goldCorner2 = (Mat_<uint8_t>(2, 2) <<
+        255, 0,
+        0, 255);
+
+    // test chessboard corners in generated image
+    for (const Point3f& p: board.getChessboardCorners()) {
+        Point2f chessCorner(pixInSquare*(p.x/squareLength),
+                            pixInSquare*(p.y/squareLength));
+        Mat winCorner = chessboardZoneImg(Rect(Point(cvRound(chessCorner.x) - 1, cvRound(chessCorner.y) - 1), Size(2, 2)));
+        bool eq = (cv::countNonZero(goldCorner1 != winCorner) == 0) | (cv::countNonZero(goldCorner2 != winCorner) == 0);
+        ASSERT_TRUE(eq);
+    }
+    // TODO: fix aruco generateImage and add test aruco corners for generated image
+}
+
 // Temporary disabled in https://github.com/opencv/opencv/pull/24338
 // 5.x version produces corners with different shape than 4.x (32F_C2 instead of 2x 32FC1)
 TEST(Charuco, DISABLED_testSeveralBoardsWithCustomIds)
diff --git a/modules/objdetect/test/test_precomp.hpp b/modules/objdetect/test/test_precomp.hpp
index 88b8e9a4f5..452a0d78d6 100644
--- a/modules/objdetect/test/test_precomp.hpp
+++ b/modules/objdetect/test/test_precomp.hpp
@@ -7,10 +7,6 @@
 #include "opencv2/ts.hpp"
 #include "opencv2/objdetect.hpp"
-#if defined CV_CXX11
-    #include
-#else
-    #include
-#endif
+#include
 #endif
diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp
index 7f5eb37f09..45567b5d9b 100644
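Note: the issue_24806 test above pins down the corrected generateImage() geometry: the limiting dimension of the margin-free area determines the square size in pixels, and every interior chessboard corner must sit on a 2x2 black/white checker pattern. Roughly the same check from the Python bindings (a sketch; assumes a build containing the aruco_board.cpp fix above):

    import cv2 as cv

    d = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_4X4_1000)
    board = cv.aruco.CharucoBoard((7, 4), 13.0, 10.0, d)
    margin = 24
    img = board.generateImage((400, 300), marginSize=margin)
    # width is the limiting dimension here: (400 - 48)/7 < (300 - 48)/4
    pix_in_square = (400 - 2 * margin) / 7.0
    print(img.shape, pix_in_square)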
--- a/modules/objdetect/test/test_qrcode_encode.cpp +++ b/modules/objdetect/test/test_qrcode_encode.cpp @@ -5,16 +5,6 @@ #include "test_precomp.hpp" namespace opencv_test { namespace { -#if !defined CV_CXX11 -// Wrapper for generating seeded random number via std::rand. -template -class SeededRandFunctor { -public: - SeededRandFunctor() { std::srand(Seed); } - int operator()(int i) { return std::rand() % (i + 1); } -}; -#endif - std::string encode_qrcode_images_name[] = { "version1_mode1.png", "version1_mode2.png", "version1_mode4.png", "version2_mode1.png", "version2_mode2.png", "version2_mode4.png", diff --git a/modules/python/common.cmake b/modules/python/common.cmake index a233fe0232..cd6c27984a 100644 --- a/modules/python/common.cmake +++ b/modules/python/common.cmake @@ -46,6 +46,7 @@ if(${PYTHON}_LIMITED_API) # support only python3.3+ ocv_assert(${PYTHON}_VERSION_MAJOR EQUAL 3 AND ${PYTHON}_VERSION_MINOR GREATER 2) target_compile_definitions(${the_module} PRIVATE CVPY_DYNAMIC_INIT) + target_compile_definitions(${the_module} PRIVATE PYTHON3_LIMITED_API_VERSION=${PYTHON3_LIMITED_API_VERSION}) if(WIN32) string(REPLACE "python${${PYTHON}_VERSION_MAJOR}${${PYTHON}_VERSION_MINOR}.lib" diff --git a/modules/python/python3/CMakeLists.txt b/modules/python/python3/CMakeLists.txt index d95af21e04..da86ba5c5e 100644 --- a/modules/python/python3/CMakeLists.txt +++ b/modules/python/python3/CMakeLists.txt @@ -2,15 +2,6 @@ if(NOT PYTHON3_INCLUDE_PATH OR NOT PYTHON3_NUMPY_INCLUDE_DIRS) ocv_module_disable(python3) endif() -# Problem in numpy >=1.15 <1.17 -if(PYTHON3_LIMITED_API - AND NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" - AND PYTHON3_NUMPY_VERSION VERSION_LESS "1.17" - ) - message(WARNING "Current NUMPY version (${PYTHON3_NUMPY_VERSION}) is not compatible with LIMITED_API.") - set(PYTHON3_LIMITED_API OFF) -endif() - set(the_description "The python3 bindings") set(MODULE_NAME python3) set(MODULE_INSTALL_SUBDIR python3) diff --git a/modules/python/src2/cv2.hpp b/modules/python/src2/cv2.hpp index b7992582ad..06080f1aa1 100644 --- a/modules/python/src2/cv2.hpp +++ b/modules/python/src2/cv2.hpp @@ -13,7 +13,10 @@ // #define Py_DEBUG #if defined(CVPY_DYNAMIC_INIT) && !defined(Py_DEBUG) -# define Py_LIMITED_API 0x03030000 +# ifndef PYTHON3_LIMITED_API_VERSION +# define PYTHON3_LIMITED_API_VERSION 0x03060000 +# endif +# define Py_LIMITED_API PYTHON3_LIMITED_API_VERSION #endif #include @@ -42,17 +45,20 @@ class ArgInfo private: static const uint32_t arg_outputarg_flag = 0x1; static const uint32_t arg_arithm_op_src_flag = 0x2; + static const uint32_t arg_pathlike_flag = 0x4; public: const char* name; bool outputarg; bool arithm_op_src; + bool pathlike; // more fields may be added if necessary ArgInfo(const char* name_, uint32_t arg_) : name(name_), outputarg((arg_ & arg_outputarg_flag) != 0), - arithm_op_src((arg_ & arg_arithm_op_src_flag) != 0) {} + arithm_op_src((arg_ & arg_arithm_op_src_flag) != 0), + pathlike((arg_ & arg_pathlike_flag) != 0) {} private: ArgInfo(const ArgInfo&) = delete; diff --git a/modules/python/src2/cv2_convert.cpp b/modules/python/src2/cv2_convert.cpp index 40e1608fae..c4a867892a 100644 --- a/modules/python/src2/cv2_convert.cpp +++ b/modules/python/src2/cv2_convert.cpp @@ -701,6 +701,18 @@ bool pyopencv_to(PyObject* obj, String &value, const ArgInfo& info) return true; } std::string str; + +#if ((PY_VERSION_HEX >= 0x03060000) && !defined(Py_LIMITED_API)) || (Py_LIMITED_API >= 0x03060000) + if (info.pathlike) + { + obj = PyOS_FSPath(obj); + if (PyErr_Occurred()) + { + 
failmsg("Expected '%s' to be a str or path-like object", info.name); + return false; + } + } +#endif if (getUnicodeString(obj, str)) { value = str; diff --git a/modules/python/src2/cv2_util.cpp b/modules/python/src2/cv2_util.cpp index d3691d3a59..817a4a8eff 100644 --- a/modules/python/src2/cv2_util.cpp +++ b/modules/python/src2/cv2_util.cpp @@ -128,11 +128,7 @@ void pyPopulateArgumentConversionErrors() PySafeObject exception_message(PyObject_Str(exception_value)); std::string message; getUnicodeString(exception_message, message); -#ifdef CV_CXX11 conversionErrorsTLS.getRef().push_back(std::move(message)); -#else - conversionErrorsTLS.getRef().push_back(message); -#endif } } diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index f37e153779..e6fec5510e 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -500,6 +500,10 @@ class ArgInfo(object): def outputarg(self): return '/O' in self._modifiers or '/IO' in self._modifiers + @property + def pathlike(self): + return '/PATH' in self._modifiers + @property def returnarg(self): return self.outputarg @@ -523,6 +527,7 @@ class ArgInfo(object): def crepr(self): arg = 0x01 if self.outputarg else 0x0 arg += 0x02 if self.arithm_op_src_arg else 0x0 + arg += 0x04 if self.pathlike else 0x0 return "ArgInfo(\"%s\", %d)" % (self.name, arg) diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index c9139c516b..9b00e5d0be 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -92,6 +92,10 @@ class CppHeaderParser(object): modlist.append("/IO") arg_str = arg_str.replace("CV_IN_OUT", "") + if "CV_WRAP_FILE_PATH" in arg_str: + modlist.append("/PATH") + arg_str = arg_str.replace("CV_WRAP_FILE_PATH", "") + isarray = False npos = arg_str.find("CV_CARRAY") if npos >= 0: @@ -453,8 +457,7 @@ class CppHeaderParser(object): ("CV_INLINE", ""), ("CV_DEPRECATED", ""), ("CV_DEPRECATED_EXTERNAL", ""), - ("CV_NODISCARD_STD", ""), - ("CV_NODISCARD", "")]).strip() + ("CV_NODISCARD_STD", "")]).strip() if decl_str.strip().startswith('virtual'): virtual_method = True @@ -629,6 +632,8 @@ class CppHeaderParser(object): ("noArray", arg_type)]).strip() if '/IO' in modlist and '/O' in modlist: modlist.remove('/O') + if (arg_name.lower() == 'filename' or arg_name.lower() == 'filepath') and '/PATH' not in modlist: + modlist.append('/PATH') args.append([arg_type, arg_name, defval, modlist]) npos = arg_start-1 diff --git a/modules/python/test/test_pathlike.py b/modules/python/test/test_pathlike.py new file mode 100644 index 0000000000..d654ce24ad --- /dev/null +++ b/modules/python/test/test_pathlike.py @@ -0,0 +1,38 @@ +from tests_common import NewOpenCVTests, unittest +import cv2 as cv +import os + + +def import_path(): + import sys + if sys.version_info[0] < 3 or sys.version_info[1] < 6: + raise unittest.SkipTest('Python 3.6+ required') + + from pathlib import Path + return Path + + +class CanPassPathLike(NewOpenCVTests): + def test_pathlib_path(self): + Path = import_path() + + img_path = self.find_file('cv/imgproc/stuff.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')]) + + image_from_str = cv.imread(img_path) + self.assertIsNotNone(image_from_str) + + image_from_path = cv.imread(Path(img_path)) + self.assertIsNotNone(image_from_path) + + + def test_type_mismatch(self): + import_path() # checks python version + + with self.assertRaises(TypeError) as context: + cv.imread(123) + + self.assertTrue('str or path-like' in str(context.exception)) + + +if __name__ == '__main__': + 
NewOpenCVTests.bootstrap()
diff --git a/modules/stitching/src/precomp.hpp b/modules/stitching/src/precomp.hpp
index 3b9cf0f85b..0718ab866b 100644
--- a/modules/stitching/src/precomp.hpp
+++ b/modules/stitching/src/precomp.hpp
@@ -45,6 +45,7 @@
 #include "opencv2/opencv_modules.hpp"
+#include
 #include
 #include
 #include
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index c1d249ea14..63edae1e67 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -47,3 +47,7 @@ if(OPENCV_DISABLE_THREAD_SUPPORT)
   # described in `ts_gtest.h`.
   ocv_target_compile_definitions(${the_module} PUBLIC GTEST_HAS_PTHREAD=0)
 endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "QNX")
+  ocv_target_link_libraries(${the_module} PUBLIC regex)
+endif()
\ No newline at end of file
diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp
index 86f2d07761..a768d0047b 100644
--- a/modules/ts/include/opencv2/ts.hpp
+++ b/modules/ts/include/opencv2/ts.hpp
@@ -941,13 +941,9 @@ namespace opencv_test {
 using namespace cvtest;
 using namespace cv;
-#ifdef CV_CXX11
 #define CVTEST_GUARD_SYMBOL(name) \
     class required_namespace_specificatin_here_for_symbol_ ## name {}; \
     using name = required_namespace_specificatin_here_for_symbol_ ## name;
-#else
-#define CVTEST_GUARD_SYMBOL(name) /* nothing */
-#endif
 CVTEST_GUARD_SYMBOL(norm)
 CVTEST_GUARD_SYMBOL(add)
diff --git a/modules/video/include/opencv2/video/tracking.hpp b/modules/video/include/opencv2/video/tracking.hpp
index 8dbcfbf216..df34a9f97c 100644
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -564,6 +564,12 @@ public:
     /** @copybrief getGamma @see getGamma */
     CV_WRAP virtual void setGamma(float val) = 0;
+    /** @brief Norm value shift for robust penalizer
+    @see setEpsilon */
+    CV_WRAP virtual float getEpsilon() const = 0;
+    /** @copybrief getEpsilon @see getEpsilon */
+    CV_WRAP virtual void setEpsilon(float val) = 0;
+
     /** @brief Creates an instance of VariationalRefinement
     */
     CV_WRAP static Ptr<VariationalRefinement> create();
@@ -645,6 +651,12 @@ public:
     /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */
     CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0;
+    /** @brief Norm value shift for robust penalizer
+    @see setVariationalRefinementEpsilon */
+    CV_WRAP virtual float getVariationalRefinementEpsilon() const = 0;
+    /** @copybrief getVariationalRefinementEpsilon @see getVariationalRefinementEpsilon */
+    CV_WRAP virtual void setVariationalRefinementEpsilon(float val) = 0;
+
     /** @brief Whether to use mean-normalization of patches when computing patch distance.
It is turned on by default as it typically provides a noticeable quality boost because of increased robustness to diff --git a/modules/video/src/dis_flow.cpp b/modules/video/src/dis_flow.cpp index 40ac4517a4..75090d093d 100644 --- a/modules/video/src/dis_flow.cpp +++ b/modules/video/src/dis_flow.cpp @@ -67,6 +67,7 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow float variational_refinement_alpha; float variational_refinement_gamma; float variational_refinement_delta; + float variational_refinement_epsilon; bool use_mean_normalization; bool use_spatial_propagation; @@ -92,6 +93,8 @@ class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; } float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; } void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; } + float getVariationalRefinementEpsilon() const CV_OVERRIDE { return variational_refinement_epsilon; } + void setVariationalRefinementEpsilon(float val) CV_OVERRIDE { variational_refinement_epsilon = val; } bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; } void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; } @@ -219,6 +222,7 @@ DISOpticalFlowImpl::DISOpticalFlowImpl() variational_refinement_alpha = 20.f; variational_refinement_gamma = 10.f; variational_refinement_delta = 5.f; + variational_refinement_epsilon = 0.01f; border_size = 16; use_mean_normalization = true; @@ -306,6 +310,7 @@ void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_fl variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); variational_refinement_processors[i]->setDelta(variational_refinement_delta); variational_refinement_processors[i]->setGamma(variational_refinement_gamma); + variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); variational_refinement_processors[i]->setSorIterations(5); variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); @@ -1274,6 +1279,7 @@ void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, variational_refinement_processors[i]->setAlpha(variational_refinement_alpha); variational_refinement_processors[i]->setDelta(variational_refinement_delta); variational_refinement_processors[i]->setGamma(variational_refinement_gamma); + variational_refinement_processors[i]->setEpsilon(variational_refinement_epsilon); variational_refinement_processors[i]->setSorIterations(5); variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter); diff --git a/modules/video/src/variational_refinement.cpp b/modules/video/src/variational_refinement.cpp index 968bce6717..b1891c60df 100644 --- a/modules/video/src/variational_refinement.cpp +++ b/modules/video/src/variational_refinement.cpp @@ -76,6 +76,8 @@ class VariationalRefinementImpl CV_FINAL : public VariationalRefinement void setDelta(float val) CV_OVERRIDE { delta = val; } float getGamma() const CV_OVERRIDE { return gamma; } void setGamma(float val) CV_OVERRIDE { gamma = val; } + float getEpsilon() const CV_OVERRIDE { return epsilon; } + void setEpsilon(float val) CV_OVERRIDE { epsilon = val; } protected: //!< internal buffers /* This struct defines a special data layout for Mat_. 
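Note: the epsilon introduced above is the norm shift used by the robust penalizer in the variational refinement step; DISOpticalFlow forwards its value (default 0.01f) to every per-scale VariationalRefinement instance. Both accessors are CV_WRAP-ed, so with a build containing this patch they should be reachable from the bindings, e.g. (sketch):

    import cv2 as cv

    dis = cv.DISOpticalFlow_create(cv.DISOPTICAL_FLOW_PRESET_MEDIUM)
    dis.setVariationalRefinementEpsilon(0.01)

    vr = cv.VariationalRefinement_create()
    vr.setEpsilon(0.01)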
Original buffer is split into two: one for "red" diff --git a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m index 460cce6d32..281929c558 100644 --- a/modules/videoio/misc/objc/ios/CvPhotoCamera2.m +++ b/modules/videoio/misc/objc/ios/CvPhotoCamera2.m @@ -105,7 +105,7 @@ { // setup still image output with jpeg codec self.stillImageOutput = [[AVCaptureStillImageOutput alloc] init]; - NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecJPEG, AVVideoCodecKey, nil]; + NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecTypeJPEG, AVVideoCodecKey, nil]; [self.stillImageOutput setOutputSettings:outputSettings]; [self.captureSession addOutput:self.stillImageOutput]; diff --git a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm index 7f4abdb578..188d6c5ec7 100644 --- a/modules/videoio/misc/objc/ios/CvVideoCamera2.mm +++ b/modules/videoio/misc/objc/ios/CvVideoCamera2.mm @@ -315,7 +315,7 @@ static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;} NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:self.imageWidth], AVVideoWidthKey, [NSNumber numberWithInt:self.imageHeight], AVVideoHeightKey, - AVVideoCodecH264, AVVideoCodecKey, + AVVideoCodecTypeH264, AVVideoCodecKey, nil ]; diff --git a/modules/videoio/src/cap_avfoundation_mac.mm b/modules/videoio/src/cap_avfoundation_mac.mm index c0ad4810d4..98df630c74 100644 --- a/modules/videoio/src/cap_avfoundation_mac.mm +++ b/modules/videoio/src/cap_avfoundation_mac.mm @@ -1220,13 +1220,13 @@ CvVideoWriter_AVFoundation::CvVideoWriter_AVFoundation(const std::string &filena is_good = false; } - // Three codec supported AVVideoCodecH264 AVVideoCodecJPEG AVVideoCodecTypeHEVC + // Three codec supported AVVideoCodecTypeH264 AVVideoCodecTypeJPEG AVVideoCodecTypeHEVC // On iPhone 3G H264 is not supported. if (fourcc == CV_FOURCC('J','P','E','G') || fourcc == CV_FOURCC('j','p','e','g') || fourcc == CV_FOURCC('M','J','P','G') || fourcc == CV_FOURCC('m','j','p','g')){ - codec = [AVVideoCodecJPEG copy]; // Use JPEG codec if specified, otherwise H264 + codec = [AVVideoCodecTypeJPEG copy]; // Use JPEG codec if specified, otherwise H264 }else if(fourcc == CV_FOURCC('H','2','6','4') || fourcc == CV_FOURCC('a','v','c','1')){ - codec = [AVVideoCodecH264 copy]; + codec = [AVVideoCodecTypeH264 copy]; // Available since macOS 10.13 #if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED >= 101300 }else if(fourcc == CV_FOURCC('H','2','6','5') || fourcc == CV_FOURCC('h','v','c','1') || diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp index 3dcd4e81d5..9be4b08279 100644 --- a/modules/videoio/src/cap_ffmpeg_impl.hpp +++ b/modules/videoio/src/cap_ffmpeg_impl.hpp @@ -1587,8 +1587,11 @@ bool CvCapture_FFMPEG::grabFrame() if (picture_pts == AV_NOPTS_VALUE_) { if (!rawMode) picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts; - else - picture_pts = packet.pts != AV_NOPTS_VALUE_ && packet.pts != 0 ? packet.pts : packet.dts; + else { + const AVPacket& packet_raw = packet.data != 0 ? packet : packet_filtered; + picture_pts = packet_raw.pts != AV_NOPTS_VALUE_ && packet_raw.pts != 0 ? 
packet_raw.pts : packet_raw.dts;
+            if (picture_pts < 0) picture_pts = 0;
+        }
         frame_number++;
     }
 }
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index cdaccabe45..41e98794b9 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -2820,7 +2820,8 @@ void handleMessage(GstElement * pipeline)
         if (gst_is_missing_plugin_message(msg))
         {
-            CV_WARN("your GStreamer installation is missing a required plugin");
+            CV_WARN("your GStreamer installation is missing a required plugin: " <<
+                    gst_missing_plugin_message_get_description(msg));
         }
         else
         {
diff --git a/modules/videoio/src/cap_mfx_common.hpp b/modules/videoio/src/cap_mfx_common.hpp
index 9824e89dc5..b10d7115ba 100644
--- a/modules/videoio/src/cap_mfx_common.hpp
+++ b/modules/videoio/src/cap_mfx_common.hpp
@@ -334,26 +334,11 @@ protected:
 // TODO: move to core::util?
-#ifdef CV_CXX11
 #include <thread>
 static void sleep_ms(int64 ms)
 {
     std::this_thread::sleep_for(std::chrono::milliseconds(ms));
 }
-#elif defined(__linux__)
-#include
-static void sleep_ms(int64 ms)
-{
-    nanosleep(ms * 1000 * 1000);
-}
-#elif defined _WIN32
-static void sleep_ms(int64 ms)
-{
-    Sleep(ms);
-}
-#else
-#error "Can not detect sleep_ms() implementation"
-#endif
 // Linux specific
diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp
index 6fbcd2aa02..93545c615e 100644
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/modules/videoio/src/cap_winrt_bridge.hpp b/modules/videoio/src/cap_winrt_bridge.hpp
index a1e134e6ab..b78f8544bb 100644
--- a/modules/videoio/src/cap_winrt_bridge.hpp
+++ b/modules/videoio/src/cap_winrt_bridge.hpp
@@ -33,7 +33,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -114,4 +114,4 @@ private:
     cv::Mat backInputMat;

     int deviceIndex, width, height;
-};
\ No newline at end of file
+};
diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp
index 7e09d61729..88f0c8f4bd 100644
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@@ -235,6 +235,66 @@ const videoio_container_params_t videoio_container_params[] =
 INSTANTIATE_TEST_CASE_P(/**/, videoio_container, testing::ValuesIn(videoio_container_params));
+typedef tuple<VideoCaptureAPIs, string, int, int, int, int, int> videoio_container_get_params_t;
+typedef testing::TestWithParam<videoio_container_get_params_t> videoio_container_get;
+
+TEST_P(videoio_container_get, read)
+{
+    const VideoCaptureAPIs api = get<0>(GetParam());
+
+    if (!videoio_registry::hasBackend(api))
+        throw SkipTestException("Backend was not found");
+
+    const string fileName = get<1>(GetParam());
+    const int height = get<2>(GetParam());
+    const int width = get<3>(GetParam());
+    const int nFrames = get<4>(GetParam());
+    const int bitrate = get<5>(GetParam());
+    const int fps = get<6>(GetParam());
+
+    VideoCapture container(findDataFile(fileName), api, { CAP_PROP_FORMAT, -1 });
+    if (!container.isOpened())
+        throw SkipTestException("Video stream is not supported");
+
+    const int heightProp = static_cast<int>(container.get(CAP_PROP_FRAME_HEIGHT));
+    ASSERT_EQ(height, heightProp);
+    const int widthProp = static_cast<int>(container.get(CAP_PROP_FRAME_WIDTH));
+    ASSERT_EQ(width, widthProp);
+    const int nFramesProp = static_cast<int>(container.get(CAP_PROP_FRAME_COUNT));
+    ASSERT_EQ(nFrames, nFramesProp);
+    const int bitrateProp = static_cast<int>(container.get(CAP_PROP_BITRATE));
+    ASSERT_EQ(bitrate, bitrateProp);
+    const double fpsProp = container.get(CAP_PROP_FPS);
+    ASSERT_EQ(fps, fpsProp);
+    // remove when PR fixing raw video CAP_PROP_POS_MSEC return value is merged and windows dll is updated
+#ifndef _WIN32
+    vector<int> displayTimeMs;
+    int iFrame = 1;
+    while (container.grab()) {
+        displayTimeMs.push_back(static_cast<int>(container.get(CAP_PROP_POS_MSEC)));
+        const int iFrameProp = static_cast<int>(container.get(CAP_PROP_POS_FRAMES));
+        ASSERT_EQ(iFrame++, iFrameProp);
+    }
+    sort(displayTimeMs.begin(), displayTimeMs.end());
+    vector<int> displayTimeDiffMs(displayTimeMs.size());
+    std::adjacent_difference(displayTimeMs.begin(), displayTimeMs.end(), displayTimeDiffMs.begin());
+    auto minTimeMsIt = min_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end());
+    auto maxTimeMsIt = max_element(displayTimeDiffMs.begin() + 1, displayTimeDiffMs.end());
+    const int frameTimeMs = static_cast<int>(1000.0 / fps);
+    ASSERT_NEAR(frameTimeMs, *minTimeMsIt, 1);
+    ASSERT_NEAR(frameTimeMs, *maxTimeMsIt, 1);
+#endif
+}
+
+const videoio_container_get_params_t videoio_container_get_params[] =
+{
+    videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mp4", 384, 672, 125, 483, 24),
+    videoio_container_get_params_t(CAP_FFMPEG, "video/big_buck_bunny.mjpg.avi", 384, 672, 125, 2713, 24),
+    videoio_container_get_params_t(CAP_FFMPEG, "video/sample_322x242_15frames.yuv420p.libx264.mp4", 242, 322, 15, 542, 25)
+};
+
+INSTANTIATE_TEST_CASE_P(/**/, videoio_container_get, testing::ValuesIn(videoio_container_get_params));
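Note: the videoio_container_get test above opens the capture with {CAP_PROP_FORMAT, -1}, which puts the FFmpeg backend into raw (demux-only) mode: frame count, bitrate, FPS and per-frame CAP_PROP_POS_MSEC then come from the container without decoding. Equivalent usage from Python (sketch; the file name is a placeholder):

    import cv2 as cv

    cap = cv.VideoCapture('big_buck_bunny.mp4', cv.CAP_FFMPEG,
                          [cv.CAP_PROP_FORMAT, -1])
    if cap.isOpened():
        print(cap.get(cv.CAP_PROP_FRAME_COUNT), cap.get(cv.CAP_PROP_BITRATE))
        while cap.grab():
            print(cap.get(cv.CAP_PROP_POS_MSEC))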
typedef tuple videoio_encapsulate_params_t;
typedef testing::TestWithParam< videoio_encapsulate_params_t > videoio_encapsulate;
diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp
index 7106faeb1e..cf37b09559 100644
--- a/modules/videoio/test/test_precomp.hpp
+++ b/modules/videoio/test/test_precomp.hpp
@@ -6,6 +6,7 @@
 #include
 #include
+#include <numeric>
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/ocl_test.hpp"
diff --git a/platforms/android/aar-template/OpenCV/build.gradle.template b/platforms/android/aar-template/OpenCV/build.gradle.template
index 4f3a3846ec..10c8e64aa7 100644
--- a/platforms/android/aar-template/OpenCV/build.gradle.template
+++ b/platforms/android/aar-template/OpenCV/build.gradle.template
@@ -39,7 +39,6 @@ android {
         }
     }
     buildFeatures {
-        aidl true
         prefabPublishing true
         buildConfig true
     }
@@ -52,13 +51,13 @@ android {
         main {
             java.srcDirs = ['src/main/java']
             //jniLibs.srcDirs = ['libs']
-            aidl.srcDirs = ['src/main/java']
         }
     }
     publishing {
         singleVariant('release') {
             withSourcesJar()
+            withJavadocJar()
         }
     }
 }
@@ -66,14 +65,42 @@ android {
 publishing {
     publications {
         release(MavenPublication) {
+            // Builds aar, sources jar and javadoc jar from project sources and creates maven
+            groupId = 'org.opencv'
+            artifactId = '${PACKAGE_NAME}'
+            version = '${OPENCV_VERSION}'
+            afterEvaluate {
+                from components.release
+            }
+        }
+        modified(MavenPublication) {
+            // Creates maven from opencv-release.aar
             groupId = 'org.opencv'
             artifactId = '${PACKAGE_NAME}'
             version = '${OPENCV_VERSION}'
             artifact("opencv-release.aar")
-
-//            afterEvaluate {
-//                from components.release
-//            }
+            pom {
+                name = "OpenCV"
+                description = "Open Source Computer Vision Library"
+                url = "https://opencv.org/"
+                licenses {
+                    license {
+                        name = "The Apache License, Version 2.0"
+                        url = "https://github.com/opencv/opencv/blob/master/LICENSE"
+                    }
+                }
+                developers {
+                    developer {
+                        id = "admin"
+                        name = "OpenCV Team"
+                        email = "admin@opencv.org"
+                    }
+                }
+                scm {
+                    connection = "scm:git:https://github.com/opencv/opencv.git"
+                    url =
"https://github.com/opencv/opencv" + } + } } } repositories { @@ -85,4 +112,4 @@ publishing { } dependencies { -} \ No newline at end of file +} diff --git a/platforms/android/build_java_shared_aar.py b/platforms/android/build_java_shared_aar.py index e99c78ec28..ffb63c67e5 100755 --- a/platforms/android/build_java_shared_aar.py +++ b/platforms/android/build_java_shared_aar.py @@ -144,6 +144,8 @@ def main(args): print("Creating local maven repo...") shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar")) + + print("Creating a maven repo from project sources (with sources jar and javadoc jar)...") subprocess.run(["./gradlew", "publishReleasePublicationToMyrepoRepository"], shell=False, cwd=ANDROID_PROJECT_DIR, @@ -153,6 +155,17 @@ def main(args): shutil.move(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)) + print("Creating a maven repo from modified AAR (with cpp libraries)...") + subprocess.run(["./gradlew", "publishModifiedPublicationToMyrepoRepository"], + shell=False, + cwd=ANDROID_PROJECT_DIR, + check=True) + + # Replacing AAR from the first maven repo with modified AAR from the second maven repo + shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), + path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME), + dirs_exist_ok=True) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Builds AAR with Java and shared C++ libs from OpenCV SDK") diff --git a/platforms/android/build_static_aar.py b/platforms/android/build_static_aar.py index c1ab4046f4..20054047fa 100755 --- a/platforms/android/build_static_aar.py +++ b/platforms/android/build_static_aar.py @@ -216,6 +216,7 @@ def main(args): shutil.copy(final_aar_path, path.join(ANDROID_PROJECT_DIR, "OpenCV/opencv-release.aar")) + print("Creating a maven repo from project sources (with sources jar and javadoc jar)...") subprocess.run(["./gradlew", "publishReleasePublicationToMyrepoRepository"], shell=False, cwd=ANDROID_PROJECT_DIR, @@ -224,6 +225,18 @@ def main(args): os.makedirs(path.join(FINAL_REPO_PATH, "org/opencv"), exist_ok=True) shutil.move(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME)) + + print("Creating a maven repo from modified AAR (with cpp libraries)...") + subprocess.run(["./gradlew", "publishModifiedPublicationToMyrepoRepository"], + shell=False, + cwd=ANDROID_PROJECT_DIR, + check=True) + + # Replacing AAR from the first maven repo with modified AAR from the second maven repo + shutil.copytree(path.join(ANDROID_PROJECT_DIR, "OpenCV/build/repo/org/opencv", MAVEN_PACKAGE_NAME), + path.join(FINAL_REPO_PATH, "org/opencv", MAVEN_PACKAGE_NAME), + dirs_exist_ok=True) + print("Done") diff --git a/platforms/linux/riscv64-071-gcc.toolchain.cmake b/platforms/linux/riscv64-071-gcc.toolchain.cmake index 53e4a7fced..0542006570 100644 --- a/platforms/linux/riscv64-071-gcc.toolchain.cmake +++ b/platforms/linux/riscv64-071-gcc.toolchain.cmake @@ -4,5 +4,54 @@ set(CMAKE_SYSTEM_PROCESSOR riscv64) set(CMAKE_CXX_COMPILER riscv64-unknown-linux-gnu-g++) set(CMAKE_C_COMPILER riscv64-unknown-linux-gnu-gcc) -set(CMAKE_CXX_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071") -set(CMAKE_C_FLAGS_INIT "-march=rv64gcv -mabi=lp64d -D__riscv_vector_071") +# MangoPi MQ Pro - C906FD, C906FDV +# Lichee Pi 4A - C910, C910V (?) 
+# CanMV K230 - C908, C908V + +# See https://github.com/T-head-Semi/gcc/blob/xuantie-gcc-10.4.0/gcc/config/riscv/riscv-cores.def + +set(_enable_vector OFF) +if(CORE STREQUAL "C906FD") + set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") +elseif(CORE STREQUAL "C906FDV") + set(CMAKE_C_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c906fd -mabi=lp64d -mtune=c906fd") + # Disabled due to limited 64-bit SEW support + # set(_enable_vector ON) +elseif(CORE STREQUAL "C908") + set(CMAKE_C_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908 -mabi=lp64d -mtune=c908") +elseif(CORE STREQUAL "C908V") + set(CMAKE_C_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c908v -mabi=lp64d -mtune=c908") + set(_enable_vector ON) # RVV 1.0 +elseif(CORE STREQUAL "C910") + set(CMAKE_C_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c910 -mabi=lp64d -mtune=c910") +elseif(CORE STREQUAL "C910V") + set(CMAKE_C_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d") + set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdcv0p7xthead -mabi=lp64d") + set(_enable_vector ON) # RVV 0.7.1 +elseif(CORE STREQUAL "C920") + set(CMAKE_C_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920 -mabi=lp64d -mtune=c920") + set(_enable_vector ON) # RVV 0.7.1 +elseif(CORE STREQUAL "C920V2") + set(CMAKE_C_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2") + set(CMAKE_CXX_FLAGS_INIT "-mcpu=c920v2 -mabi=lp64d -mtune=c920v2") + set(_enable_vector ON) # RVV 1.0 +else() + set(CMAKE_C_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d") + set(CMAKE_CXX_FLAGS_INIT "-march=rv64imafdc_zihintpause_zfh_zba_zbb_zbc_zbs_xtheadc -mabi=lp64d") +endif() + +if(_enable_vector) + set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128") + set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -D__riscv_vector_071 -mrvv-vector-bits=128") +endif() + +if(ENABLE_GCOV) + set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fprofile-arcs -ftest-coverage") + set(CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS_INIT} -fprofile-arcs -ftest-coverage") +endif() diff --git a/platforms/linux/riscv64-andes-gcc.toolchain.cmake b/platforms/linux/riscv64-andes-gcc.toolchain.cmake new file mode 100755 index 0000000000..ce733fc790 --- /dev/null +++ b/platforms/linux/riscv64-andes-gcc.toolchain.cmake @@ -0,0 +1,10 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory") + +set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc) +set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gc -mext-dsp") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gc -mext-dsp") diff --git a/samples/android/15-puzzle/build.gradle.in b/samples/android/15-puzzle/build.gradle.in index bf8921b98d..e7f6b4af56 100644 --- a/samples/android/15-puzzle/build.gradle.in +++ b/samples/android/15-puzzle/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } @@ -31,7 +30,7 @@ dependencies { if (gradle.opencv_source == 'sdk_path') { 
println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == 'maven_local' || gradle.opencv_source == 'maven_cenral') { + } else if (gradle.opencv_source == 'maven_local' || gradle.opencv_source == 'maven_central') { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/camera-calibration/build.gradle.in b/samples/android/camera-calibration/build.gradle.in index d79df7777e..8c97fb22ab 100644 --- a/samples/android/camera-calibration/build.gradle.in +++ b/samples/android/camera-calibration/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } @@ -31,7 +30,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/color-blob-detection/build.gradle.in b/samples/android/color-blob-detection/build.gradle.in index 6d544592a4..bd29338970 100644 --- a/samples/android/color-blob-detection/build.gradle.in +++ b/samples/android/color-blob-detection/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } @@ -31,7 +30,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/face-detection/build.gradle.in b/samples/android/face-detection/build.gradle.in index 6cc9d8cfb4..72dabd664d 100644 --- a/samples/android/face-detection/build.gradle.in +++ b/samples/android/face-detection/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } @@ -31,7 +30,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/face-detection/gradle/AndroidManifest.xml b/samples/android/face-detection/gradle/AndroidManifest.xml index 5476bcfbfb..f018df2eec 100644 --- a/samples/android/face-detection/gradle/AndroidManifest.xml +++ b/samples/android/face-detection/gradle/AndroidManifest.xml @@ -11,7 +11,6 @@ android:exported="true" android:name="FaceDetectActivity" android:label="@string/app_name" - 
diff --git a/samples/android/image-manipulations/build.gradle.in b/samples/android/image-manipulations/build.gradle.in
index a227d548cf..3c5034ea9b 100644
--- a/samples/android/image-manipulations/build.gradle.in
+++ b/samples/android/image-manipulations/build.gradle.in
@@ -19,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -31,7 +30,7 @@ dependencies {
     if (gradle.opencv_source == "sdk_path") {
         println 'Using OpenCV from from SDK'
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         println 'Using OpenCV from Maven repo'
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
diff --git a/samples/android/mobilenet-objdetect/build.gradle.in b/samples/android/mobilenet-objdetect/build.gradle.in
index 4cb1789e19..9e8e49b668 100644
--- a/samples/android/mobilenet-objdetect/build.gradle.in
+++ b/samples/android/mobilenet-objdetect/build.gradle.in
@@ -19,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -31,7 +30,7 @@ dependencies {
     if (gradle.opencv_source == "sdk_path") {
         println 'Using OpenCV from SDK'
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         println 'Using OpenCV from Maven repo'
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
diff --git a/samples/android/qr-detection/build.gradle.in b/samples/android/qr-detection/build.gradle.in
index 274f0b4129..0951b70cdb 100644
--- a/samples/android/qr-detection/build.gradle.in
+++ b/samples/android/qr-detection/build.gradle.in
@@ -19,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -30,7 +29,7 @@ dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
     if (gradle.opencv_source == "sdk_path") {
         implementation project(':opencv')
gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } } diff --git a/samples/android/tutorial-1-camerapreview/build.gradle.in b/samples/android/tutorial-1-camerapreview/build.gradle.in index a0d44eaf9a..7b308b2abb 100644 --- a/samples/android/tutorial-1-camerapreview/build.gradle.in +++ b/samples/android/tutorial-1-camerapreview/build.gradle.in @@ -19,7 +19,6 @@ android { sourceSets { main { java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ - aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@ res.srcDirs = @ANDROID_SAMPLE_RES_PATH@ manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@' } @@ -31,7 +30,7 @@ dependencies { if (gradle.opencv_source == "sdk_path") { println 'Using OpenCV from SDK' implementation project(':opencv') - } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") { + } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") { println 'Using OpenCV from Maven repo' implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@' } diff --git a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml index 98f2a2f35b..56c9ad32d6 100644 --- a/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml +++ b/samples/android/tutorial-1-camerapreview/gradle/AndroidManifest.xml @@ -12,7 +12,6 @@ android:exported="true" android:name="Tutorial1Activity" android:label="@string/app_name" - android:screenOrientation="landscape" android:configChanges="keyboardHidden|orientation"> @@ -27,13 +26,13 @@ android:largeScreens="true" android:anyDensity="true" /> - //! [camera_permissions] + - //! 
diff --git a/samples/android/tutorial-2-mixedprocessing/build.gradle.in b/samples/android/tutorial-2-mixedprocessing/build.gradle.in
index 4125d65a38..a156f42240 100644
--- a/samples/android/tutorial-2-mixedprocessing/build.gradle.in
+++ b/samples/android/tutorial-2-mixedprocessing/build.gradle.in
@@ -33,7 +33,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -44,7 +43,7 @@ android {
         }
     }
     buildFeatures {
-        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
             prefab true
         }
     }
@@ -55,7 +54,7 @@ dependencies {
     if (gradle.opencv_source == "sdk_path") {
         println 'Using OpenCV from SDK'
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         println 'Using OpenCV from Maven repo'
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
diff --git a/samples/android/tutorial-3-cameracontrol/build.gradle.in b/samples/android/tutorial-3-cameracontrol/build.gradle.in
index b7dffe86c5..d9c7f29ac3 100644
--- a/samples/android/tutorial-3-cameracontrol/build.gradle.in
+++ b/samples/android/tutorial-3-cameracontrol/build.gradle.in
@@ -19,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -31,7 +30,7 @@ dependencies {
     if (gradle.opencv_source == "sdk_path") {
         println 'Using OpenCV from SDK'
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         println 'Using OpenCV from Maven repo'
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
diff --git a/samples/android/tutorial-4-opencl/build.gradle.in b/samples/android/tutorial-4-opencl/build.gradle.in
index 4675156df2..8eeb12b17d 100644
--- a/samples/android/tutorial-4-opencl/build.gradle.in
+++ b/samples/android/tutorial-4-opencl/build.gradle.in
@@ -35,7 +35,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -46,7 +45,7 @@ android {
         }
     }
     buildFeatures {
-        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+        if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
             prefab true
         }
     }
@@ -57,7 +56,7 @@ dependencies {
     if (gradle.opencv_source == "sdk_path") {
         println 'Using OpenCV from SDK'
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         println 'Using OpenCV from Maven repo'
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
diff --git a/samples/android/video-recorder/build.gradle.in b/samples/android/video-recorder/build.gradle.in
index b279623803..d096f3190a 100644
--- a/samples/android/video-recorder/build.gradle.in
+++ b/samples/android/video-recorder/build.gradle.in
@@ -19,7 +19,6 @@ android {
     sourceSets {
         main {
             java.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
-            aidl.srcDirs = @ANDROID_SAMPLE_JAVA_PATH@
             res.srcDirs = @ANDROID_SAMPLE_RES_PATH@
             manifest.srcFile '@ANDROID_SAMPLE_MANIFEST_PATH@'
         }
@@ -30,7 +29,7 @@ dependencies {
     //implementation fileTree(dir: 'libs', include: ['*.jar'])
     if (gradle.opencv_source == "sdk_path") {
         implementation project(':opencv')
-    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_cenral") {
+    } else if (gradle.opencv_source == "maven_local" || gradle.opencv_source == "maven_central") {
         implementation 'org.opencv:opencv:@OPENCV_VERSION_PLAIN@'
     }
 }
diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
index 2dcc1ff107..5fbe81cd1d 100644
--- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
+++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp
@@ -2,6 +2,8 @@
 #include <opencv2/core.hpp>
 #include <opencv2/imgcodecs.hpp>

+#define PARALLEL_FOR_LAMBDA
+
 using namespace std;
 using namespace cv;

@@ -33,6 +35,8 @@ int mandelbrotFormula(const complex<float> &z0, const int maxIter=500) {
 }
 //! [mandelbrot-grayscale-value]

+#ifndef PARALLEL_FOR_LAMBDA
+
 //! [mandelbrot-parallel]
 class ParallelMandelbrot : public ParallelLoopBody
 {
@@ -71,6 +75,8 @@ private:
 };
 //! [mandelbrot-parallel]

+#endif // !PARALLEL_FOR_LAMBDA
+
 //! [mandelbrot-sequential]
 void sequentialMandelbrot(Mat &img, const float x1, const float y1, const float scaleX, const float scaleY)
 {
@@ -102,7 +108,7 @@ int main()

     double t1 = (double) getTickCount();

-    #ifdef CV_CXX11
+#ifdef PARALLEL_FOR_LAMBDA
     //! [mandelbrot-parallel-call-cxx11]
     parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), [&](const Range& range){
@@ -121,14 +127,14 @@ int main()
     });
     //! [mandelbrot-parallel-call-cxx11]

-    #else
+#else // PARALLEL_FOR_LAMBDA

     //! [mandelbrot-parallel-call]
     ParallelMandelbrot parallelMandelbrot(mandelbrotImg, x1, y1, scaleX, scaleY);
     parallel_for_(Range(0, mandelbrotImg.rows*mandelbrotImg.cols), parallelMandelbrot);
     //! [mandelbrot-parallel-call]

-    #endif
+#endif // PARALLEL_FOR_LAMBDA

     t1 = ((double) getTickCount() - t1) / getTickFrequency();
     cout << "Parallel Mandelbrot: " << t1 << " s" << endl;
diff --git a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
index cfa9d22b0d..cab73874a4 100644
--- a/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
+++ b/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_new.cpp
@@ -4,6 +4,8 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>

+#define PARALLEL_FOR_LAMBDA
+
 using namespace std;
 using namespace cv;

@@ -47,7 +49,8 @@ void conv_seq(Mat src, Mat &dst, Mat kernel)
 }
 //! [convolution-sequential]

-#ifdef CV_CXX11
+#ifdef PARALLEL_FOR_LAMBDA
+
 void conv_parallel(Mat src, Mat &dst, Mat kernel)
 {
     int rows = src.rows, cols = src.cols;
@@ -118,7 +121,8 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
     });
     //! [convolution-parallel-cxx11-row-split]
 }
-#else
+
+#else // PARALLEL_FOR_LAMBDA

 //! [convolution-parallel]
 class parallelConvolution : public ParallelLoopBody
@@ -235,7 +239,7 @@ void conv_parallel_row_split(Mat src, Mat &dst, Mat kernel)
     //! [convolution-parallel-function-row]
 }

-#endif
+#endif // PARALLEL_FOR_LAMBDA

 static void help(char *progName)
 {
@@ -329,4 +333,4 @@ int main(int argc, char *argv[])

     // imwrite("dst.png", dst);
     return 0;
-}
\ No newline at end of file
+}
diff --git a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
index ac1c205258..d9e0d1f94d 100644
--- a/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
+++ b/samples/cpp/tutorial_code/core/mat_the_basic_image_container/mat_the_basic_image_container.cpp
@@ -59,12 +59,12 @@ int main(int,char**)
     cout << "C = " << endl << " " << C << endl << endl;
     //! [comma]
     // do the same with initializer_list
-#ifdef CV_CXX11
+
     //! [list]
     C = (Mat_<double>({0, -1, 0, -1, 5, -1, 0, -1, 0})).reshape(3);
     cout << "C = " << endl << " " << C << endl << endl;
     //! [list]
-#endif
+
     //! [clone]
     Mat RowClone = C.row(1).clone();
     cout << "RowClone = " << endl << " " << RowClone << endl << endl;
diff --git a/samples/dnn/object_detection.cpp b/samples/dnn/object_detection.cpp
index 1391b89434..0baf35c41e 100644
--- a/samples/dnn/object_detection.cpp
+++ b/samples/dnn/object_detection.cpp
@@ -5,7 +5,7 @@
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>

-#if defined(CV_CXX11) && defined(HAVE_THREADS)
+#if defined(HAVE_THREADS)
 #define USE_THREADS 1
 #endif
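Both tutorials above now switch between their lambda and ParallelLoopBody code paths with the sample-local PARALLEL_FOR_LAMBDA macro instead of CV_CXX11; the two paths drive the same cv::parallel_for_. A self-contained sketch of the two equivalent forms (the squaring workload is arbitrary):

```cpp
#include <opencv2/core.hpp>
#include <vector>

// Functor form, as used by the samples when PARALLEL_FOR_LAMBDA is not set.
class SquareBody : public cv::ParallelLoopBody
{
public:
    explicit SquareBody(std::vector<int>& data) : data_(data) {}
    void operator()(const cv::Range& range) const CV_OVERRIDE
    {
        for (int i = range.start; i < range.end; i++)
            data_[i] = i * i;
    }
private:
    std::vector<int>& data_;
};

int main()
{
    std::vector<int> data(1000);

    // Lambda form, as used by the samples when PARALLEL_FOR_LAMBDA is set.
    cv::parallel_for_(cv::Range(0, (int)data.size()), [&](const cv::Range& range) {
        for (int i = range.start; i < range.end; i++)
            data[i] = i * i;
    });

    // Equivalent functor-based call.
    SquareBody body(data);
    cv::parallel_for_(cv::Range(0, (int)data.size()), body);
    return 0;
}
```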