Merge branch 4.x

This commit is contained in:
Alexander Smorkalov 2024-01-23 17:06:52 +03:00
commit decf6538a2
148 changed files with 3263 additions and 1564 deletions

View File

@ -1455,8 +1455,8 @@ if(WITH_WEBP OR HAVE_WEBP)
endif()
if(WITH_AVIF OR HAVE_AVIF)
if(AVIF_VERSION)
status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${AVIF_VERSION})" ELSE "NO")
if(libavif_VERSION)
status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY} (ver ${libavif_VERSION})" ELSE "NO")
else()
status(" AVIF:" AVIF_FOUND THEN "${AVIF_LIBRARY}" ELSE "NO")
endif()
@ -1852,6 +1852,7 @@ if(BUILD_opencv_python3)
else()
status(" Libraries:" HAVE_opencv_python3 THEN "${PYTHON3_LIBRARIES}" ELSE NO)
endif()
status(" Limited API:" PYTHON3_LIMITED_API THEN "YES (ver ${PYTHON3_LIMITED_API_VERSION})" ELSE NO)
status(" numpy:" PYTHON3_NUMPY_INCLUDE_DIRS THEN "${PYTHON3_NUMPY_INCLUDE_DIRS} (ver ${PYTHON3_NUMPY_VERSION})" ELSE "NO (Python3 wrappers can not be generated)")
status(" install path:" HAVE_opencv_python3 THEN "${__INSTALL_PATH_PYTHON3}" ELSE "-")
endif()

View File

@ -1,8 +1,5 @@
## OpenCV: Open Source Computer Vision Library
### Keep OpenCV Free
OpenCV is raising funds to keep the library free for everyone, and we need the support of the entire community to do it. [Donate to OpenCV on IndieGoGo](http://igg.me/at/opencv5) before the campaign ends on December 16 to show your support.
### Resources
@ -13,6 +10,7 @@ OpenCV is raising funds to keep the library free for everyone, and we need the s
* previous forum (read only): <http://answers.opencv.org>
* Issue tracking: <https://github.com/opencv/opencv/issues>
* Additional OpenCV functionality: <https://github.com/opencv/opencv_contrib>
* Donate to OpenCV: <https://opencv.org/support/>
### Contributing

View File

@ -484,7 +484,6 @@ macro(ocv_check_compiler_optimization OPT)
endmacro()
macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRING COMMON_OPTION)
if(NOT MSVC)
unset(_POSTFIX)
# Check each feature option
foreach(OPT IN LISTS ${FEATURE_NAME_LIST})
@ -499,7 +498,6 @@ macro(ocv_cpu_aarch64_baseline_merge_feature_options FEATURE_NAME_LIST FLAG_STRI
if(NOT "x${_POSTFIX}" STREQUAL "x")
set(${FLAG_STRING} "${${FLAG_STRING}} ${COMMON_OPTION}${_POSTFIX}")
endif()
endif()
endmacro()
foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
@ -596,10 +594,12 @@ foreach(OPT ${CPU_KNOWN_OPTIMIZATIONS})
endforeach()
if(AARCH64)
if(NOT MSVC)
# Define the list of NEON options to check
set(NEON_OPTIONS_LIST NEON_DOTPROD NEON_FP16 NEON_BF16)
set(BASE_ARCHITECTURE "-march=armv8.2-a")
ocv_cpu_aarch64_baseline_merge_feature_options(NEON_OPTIONS_LIST CPU_BASELINE_FLAGS ${BASE_ARCHITECTURE})
endif()
endif()
foreach(OPT ${CPU_BASELINE_REQUIRE})

View File

@ -1,13 +1,6 @@
if("${CMAKE_CXX_COMPILER};${CMAKE_C_COMPILER};${CMAKE_CXX_COMPILER_LAUNCHER}" MATCHES "ccache")
set(CMAKE_COMPILER_IS_CCACHE 1) # TODO: FIXIT Avoid setting of CMAKE_ variables
set(OPENCV_COMPILER_IS_CCACHE 1)
endif()
function(access_CMAKE_COMPILER_IS_CCACHE)
if(NOT OPENCV_SUPPRESS_DEPRECATIONS)
message(WARNING "DEPRECATED: CMAKE_COMPILER_IS_CCACHE is replaced to OPENCV_COMPILER_IS_CCACHE.")
endif()
endfunction()
variable_watch(CMAKE_COMPILER_IS_CCACHE access_CMAKE_COMPILER_IS_CCACHE)
if(ENABLE_CCACHE AND NOT OPENCV_COMPILER_IS_CCACHE)
# This works fine with Unix Makefiles and Ninja generators
find_host_program(CCACHE_PROGRAM ccache)
@ -391,7 +384,7 @@ endif()
# Apply "-Wl,--no-undefined" linker flags: https://github.com/opencv/opencv/pull/21347
if(NOT OPENCV_SKIP_LINK_NO_UNDEFINED)
if(UNIX AND (NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2"))
if(UNIX AND ((NOT APPLE OR NOT CMAKE_VERSION VERSION_LESS "3.2") AND NOT CMAKE_SYSTEM_NAME MATCHES "OpenBSD"))
set(_option "-Wl,--no-undefined")
set(_saved_CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${_option}") # requires CMake 3.2+ and CMP0056

View File

@ -136,11 +136,11 @@ macro(ocv_check_windows_crt_linkage)
cmake_policy(GET CMP0091 MSVC_RUNTIME_SET_BY_ABSTRACTION)
if(MSVC_RUNTIME_SET_BY_ABSTRACTION STREQUAL "NEW")
if(NOT BUILD_SHARED_LIBS AND BUILD_WITH_STATIC_CRT)
set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MT")
set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
else()
set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} " /MD")
set(CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG} " /MDd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MD")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MDd")
endif()
endif()
endif()

View File

@ -270,6 +270,18 @@ find_python("${OPENCV_PYTHON3_VERSION}" "${MIN_VER_PYTHON3}" PYTHON3_LIBRARY PYT
PYTHON3_INCLUDE_DIR PYTHON3_INCLUDE_DIR2 PYTHON3_PACKAGES_PATH
PYTHON3_NUMPY_INCLUDE_DIRS PYTHON3_NUMPY_VERSION)
# Problem in numpy >=1.15 <1.17
OCV_OPTION(PYTHON3_LIMITED_API "Build with Python Limited API (not available with numpy >=1.15 <1.17)" NO
VISIBLE_IF PYTHON3_NUMPY_VERSION VERSION_LESS "1.15" OR NOT PYTHON3_NUMPY_VERSION VERSION_LESS "1.17")
if(PYTHON3_LIMITED_API)
set(_default_ver "0x03060000")
if(PYTHON3_VERSION_STRING VERSION_LESS "3.6")
# fix for older pythons
set(_default_ver "0x030${PYTHON3_VERSION_MINOR}0000")
endif()
set(PYTHON3_LIMITED_API_VERSION ${_default_ver} CACHE STRING "Minimal Python version for Limited API")
endif()
if(PYTHON_DEFAULT_EXECUTABLE)
set(PYTHON_DEFAULT_AVAILABLE "TRUE")
elseif(PYTHON3_EXECUTABLE AND PYTHON3INTERP_FOUND)

View File

@ -57,6 +57,18 @@ if(CANN_INSTALL_DIR)
set(HAVE_CANN OFF)
return()
endif()
# * libacl_dvpp_mpi.so
set(libacl_dvpp_mpi "${CANN_INSTALL_DIR}/lib64")
find_library(found_libacldvppmpi NAMES acl_dvpp_mpi PATHS ${libacl_dvpp_mpi} NO_DEFAULT_PATH)
if(found_libacldvppmpi)
set(libacl_dvpp_mpi ${found_libacldvppmpi})
message(STATUS "CANN: libacl_dvpp_mpi.so is found at ${libacl_dvpp_mpi}")
else()
message(STATUS "CANN: Missing libacl_dvpp_mpi.so. Turning off HAVE_CANN")
set(HAVE_CANN OFF)
return()
endif()
# * libgraph.so
set(lib_graph "${CANN_INSTALL_DIR}/compiler/lib64")
find_library(found_lib_graph NAMES graph PATHS ${lib_graph} NO_DEFAULT_PATH)
@ -105,6 +117,7 @@ if(CANN_INSTALL_DIR)
list(APPEND libs_cann ${lib_opsproto})
list(APPEND libs_cann ${lib_graph})
list(APPEND libs_cann ${lib_ge_compiler})
list(APPEND libs_cann ${libacl_dvpp_mpi})
# * lib_graph_base.so
if(NOT CANN_VERSION_BELOW_6_3_ALPHA002)

View File

@ -89,15 +89,11 @@ else()
ocv_update(OPENCV_ANDROID_NAMESPACE_DECLARATION "")
endif()
# set android gradle java version in build.gradle and set aidl config
if(NOT (ANDROID_GRADLE_PLUGIN_VERSION VERSION_LESS "8.0.0"))
# AGP-8.0 requires a minimum JDK version of JDK17
ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "17")
# Enable aidl configuration for OpenCV compile with AGP-8.0
ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "buildFeatures { aidl true }")
else()
ocv_update(ANDROID_GRADLE_JAVA_VERSION_INIT "1_8")
ocv_update(ANDROID_GRADLE_BUILD_FEATURE_AIDL "")
endif()
set(ANDROID_GRADLE_JAVA_VERSION "${ANDROID_GRADLE_JAVA_VERSION_INIT}" CACHE STRING "Android Gradle Java version")

View File

@ -9,6 +9,9 @@ How to use the OpenCV parallel_for_ to parallelize your code {#tutorial_how_to_u
| -: | :- |
| Compatibility | OpenCV >= 3.0 |
@note See also C++ lambda usage with parallel for in [tutorial](@ref tutorial_how_to_use_OpenCV_parallel_for_new).
Goal
----
@ -20,7 +23,7 @@ If you want more information about multithreading, you will have to refer to a r
to remain simple.
Precondition
----
------------
The first precondition is to have OpenCV built with a parallel framework.
In OpenCV 3.2, the following parallel frameworks are available in that order:
@ -50,7 +53,7 @@ We will use the example of drawing a Mandelbrot set to show how from a regular s
the code to parallelize the computation.
Theory
-----------
------
The Mandelbrot set definition has been named in tribute to the mathematician Benoit Mandelbrot by the mathematician
Adrien Douady. It has been famous outside of the mathematics field as the image representation is an example of a
@ -69,7 +72,7 @@ Here, we will just introduce the formula to draw the Mandelbrot set (from the me
> \f[\limsup_{n\to\infty}|z_{n+1}|\leqslant2\f]
Pseudocode
-----------
----------
A simple algorithm to generate a representation of the Mandelbrot set is called the
["escape time algorithm"](https://en.wikipedia.org/wiki/Mandelbrot_set#Escape_time_algorithm).
@ -110,10 +113,10 @@ On this figure, we recall that the real part of a complex number is on the x-axi
You can see that the whole shape can be repeatedly visible if we zoom at particular locations.
Implementation
-----------
--------------
Escape time algorithm implementation
--------------------------
------------------------------------
@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-escape-time-algorithm
@ -121,7 +124,7 @@ Here, we used the [`std::complex`](http://en.cppreference.com/w/cpp/numeric/comp
complex number. This function performs the test to check if the pixel is in set or not and returns the "escaped" iteration.
Sequential Mandelbrot implementation
--------------------------
------------------------------------
@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-sequential
@ -149,7 +152,7 @@ The green curve corresponds to a simple linear scale transformation, the blue on
and you can observe how the lowest values will be boosted when looking at the slope at these positions.
Parallel Mandelbrot implementation
--------------------------
----------------------------------
When looking at the sequential implementation, we can notice that each pixel is computed independently. To optimize the
computation, we can perform multiple pixel calculations in parallel, by exploiting the multi-core architecture of modern
@ -181,7 +184,7 @@ C++ 11 standard allows to simplify the parallel implementation by get rid of the
@snippet how_to_use_OpenCV_parallel_for_.cpp mandelbrot-parallel-call-cxx11
Results
-----------
-------
You can find the full tutorial code [here](https://github.com/opencv/opencv/blob/5.x/samples/cpp/tutorial_code/core/how_to_use_OpenCV_parallel_for_/how_to_use_OpenCV_parallel_for_.cpp).
The performance of the parallel implementation depends of the type of CPU you have. For instance, on 4 cores / 8 threads

View File

@ -18,7 +18,7 @@ This tutorial assumes you have the following installed and configured:
- Android Studio
- JDK
- Android SDK and NDK
- OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases)
- Optional: OpenCV for Android SDK from official [release page on Github](https://github.com/opencv/opencv/releases)
or [SourceForge](https://sourceforge.net/projects/opencvlibrary/). Advanced: as alternative the SDK may be
built from source code by [instruction on wiki](https://github.com/opencv/opencv/wiki/Custom-OpenCV-Android-SDK-and-AAR-package-build).
@ -26,8 +26,9 @@ If you need help with anything of the above, you may refer to our @ref tutorial_
If you encounter any error after thoroughly following these steps, feel free to contact us via OpenCV [forum](https://forum.opencv.org). We'll do our best to help you out.
Hello OpenCV sample
-------------------
Hello OpenCV sample with SDK
----------------------------
In this section we're gonna create a simple app that does nothing but OpenCV loading. In next section we'll extend it to support camera.
@ -75,11 +76,10 @@ In addition to this instruction you can use some video guide, for example [this
@endcode
The fix was found [here](https://stackoverflow.com/questions/73225714/import-opencv-sdk-to-android-studio-chipmunk)
6. OpenCV project uses `aidl` and `buildConfig` features. Please enable them in
6. OpenCV project uses `buildConfig` feature. Please enable it in
`MyApplication/OpenCV/build.gradle` file to `android` block:
@code{.gradle}
buildFeatures{
aidl true
buildConfig true
}
@ -115,6 +115,43 @@ In addition to this instruction you can use some video guide, for example [this
![](images/run_app.png)
Hello OpenCV sample with Maven Central
--------------------------------------
Since OpenCV 4.9.0 OpenCV for Android package is available with Maven Central and may be installed
automatically as Gradle dependency. In this section we're gonna create a simple app that does nothing
but OpenCV loading with Maven Central.
1. Open Android Studio and create empty project by choosing ***Empty Views Activity***
![](images/create_empty_project.png)
2. Setup the project:
- Choose ***Java*** language
- Choose ***Groovy DSL*** build configuration language
- Choose ***Minimum SDK*** with the version number not less than OpenCV supports. For 4.9.0 minimal SDK version is 21.
![](images/setup_project.png)
3. Edit `build.gradle` and add OpenCV library to Dependencies list like this:
@code{.gradle}
dependencies {
implementation 'org.opencv:opencv:4.9.0'
}
@endcode
`4.9.0` may be replaced by any version available as [official release](https://central.sonatype.com/artifact/org.opencv/opencv).
4. Before using any OpenCV function you have to load the library first. If your application includes other
OpenCV-dependent native libraries you should load them ***after*** OpenCV initialization. Add the following
code to load the library at app start:
@snippet samples/android/tutorial-1-camerapreview/src/org/opencv/samples/tutorial1/Tutorial1Activity.java ocv_loader_init
Like this:
![](images/sample_code.png)
5. Choose a device to check the sample on and run the code by pressing `run` button
![](images/run_app.png)
Camera view sample
------------------

View File

@ -378,6 +378,9 @@ our OpenCV library that we use in our projects. Start up a command window and en
setx OpenCV_DIR D:\OpenCV\build\x64\vc16 (suggested for Visual Studio 2019 - 64 bit Windows)
setx OpenCV_DIR D:\OpenCV\build\x86\vc16 (suggested for Visual Studio 2019 - 32 bit Windows)
setx OpenCV_DIR D:\OpenCV\build\x64\vc17 (suggested for Visual Studio 2022 - 64 bit Windows)
setx OpenCV_DIR D:\OpenCV\build\x86\vc17 (suggested for Visual Studio 2022 - 32 bit Windows)
@endcode
Here the directory is where you have your OpenCV binaries (*extracted* or *built*). You can have
different platform (e.g. x64 instead of x86) or compiler type, so substitute appropriate value.

View File

@ -7,10 +7,8 @@
#include <opencv2/core/mat.hpp>
#ifdef CV_CXX11
//#include <future>
#include <chrono>
#endif
namespace cv {
@ -69,7 +67,6 @@ public:
CV_WRAP bool valid() const CV_NOEXCEPT;
#ifdef CV_CXX11
inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; }
inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
@ -89,7 +86,6 @@ public:
std::future<Mat> getFutureMat() const;
std::future<UMat> getFutureUMat() const;
#endif
#endif
// PImpl

View File

@ -147,7 +147,7 @@
#endif
#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071)
# include<riscv-vector.h>
# include<riscv_vector.h>
# define CV_RVV071 1
#endif

View File

@ -476,6 +476,8 @@ Cv64suf;
#define CV_WRAP_MAPPABLE(mappable)
#define CV_WRAP_PHANTOM(phantom_header)
#define CV_WRAP_DEFAULT(val)
/* Indicates that the function parameter has filesystem path semantic */
#define CV_WRAP_FILE_PATH
/****************************************************************************************\
* Matrix type (Mat) *
@ -755,89 +757,44 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType)
#endif
/****************************************************************************************\
* CV_NODISCARD attribute (deprecated, GCC only) *
* DONT USE: use instead the standard CV_NODISCARD_STD macro above *
* this legacy method silently fails to issue warning until some version *
* after gcc 6.3.0. Yet with gcc 7+ you can use the above standard method *
* which makes this method useless. Don't use it. *
* @deprecated use instead CV_NODISCARD_STD *
\****************************************************************************************/
#ifndef CV_NODISCARD
# if defined(__GNUC__)
# define CV_NODISCARD __attribute__((__warn_unused_result__))
# elif defined(__clang__) && defined(__has_attribute)
# if __has_attribute(__warn_unused_result__)
# define CV_NODISCARD __attribute__((__warn_unused_result__))
# endif
# endif
#endif
#ifndef CV_NODISCARD
# define CV_NODISCARD /* nothing by default */
#endif
/****************************************************************************************\
* C++ 11 *
\****************************************************************************************/
#ifndef CV_CXX11
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800)
# define CV_CXX11 1
# endif
#else
# if CV_CXX11 == 0
# undef CV_CXX11
# endif
#endif
#ifndef CV_CXX11
#ifdef __cplusplus
// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11,
// so check _MSC_VER instead. See:
// <https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus>
# if defined(_MSC_VER)
# if _MSC_VER < 1800
# error "OpenCV 4.x+ requires enabled C++11 support"
# endif
# elif __cplusplus < 201103L
# error "OpenCV 4.x+ requires enabled C++11 support"
# endif
#endif
#ifndef CV_CXX11
# define CV_CXX11 1
#endif
#define CV_CXX_MOVE_SEMANTICS 1
#define CV_CXX_MOVE(x) std::move(x)
#define CV_CXX_STD_ARRAY 1
#include <array>
#ifndef CV_OVERRIDE
# define CV_OVERRIDE override
#endif
#ifndef CV_FINAL
# define CV_FINAL final
#endif
#ifndef CV_NOEXCEPT
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
# define CV_NOEXCEPT noexcept
# endif
#endif
#ifndef CV_NOEXCEPT
# define CV_NOEXCEPT
#endif
#ifndef CV_CONSTEXPR
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
# define CV_CONSTEXPR constexpr
# endif
#endif
#ifndef CV_CONSTEXPR
# define CV_CONSTEXPR
#endif
// Integer types portability
#ifdef OPENCV_STDINT_HEADER
#include OPENCV_STDINT_HEADER
#elif defined(__cplusplus)
#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */
namespace cv {
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
}
#elif defined(_MSC_VER) || __cplusplus >= 201103L
#ifdef __cplusplus
#include <cstdint>
namespace cv {
using std::int8_t;
@ -849,19 +806,6 @@ using std::uint32_t;
using std::int64_t;
using std::uint64_t;
}
#else
#include <stdint.h>
namespace cv {
typedef ::int8_t int8_t;
typedef ::uint8_t uint8_t;
typedef ::int16_t int16_t;
typedef ::uint16_t uint16_t;
typedef ::int32_t int32_t;
typedef ::uint32_t uint32_t;
typedef ::int64_t int64_t;
typedef ::uint64_t uint64_t;
}
#endif
#else // pure C
#include <stdint.h>
#endif

View File

@ -52,10 +52,8 @@ public:
*/
void setException(const cv::Exception& exception);
#ifdef CV_CXX11
explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }
AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
#endif
// PImpl

View File

@ -8,14 +8,8 @@
#ifndef CV__EXCEPTION_PTR
# if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
# define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
# elif defined(CV_CXX11)
# else
# define CV__EXCEPTION_PTR 1
# elif defined(_MSC_VER)
# define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
# elif defined(__clang__)
# define CV__EXCEPTION_PTR 0 // C++11 only (see above)
# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
# define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
# endif
#endif
#ifndef CV__EXCEPTION_PTR

View File

@ -61,8 +61,7 @@
#endif
#if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \
&& defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY)
#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
#include <unsupported/Eigen/CXX11/Tensor>
#define OPENCV_EIGEN_TENSOR_SUPPORT 1
#endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3

File diff suppressed because it is too large Load Diff

View File

@ -53,6 +53,7 @@
#include "opencv2/core/bufferpool.hpp"
#include <array>
#include <type_traits>
namespace cv

View File

@ -386,10 +386,8 @@ public:
static Vec randn(_Tp a, _Tp b);
static Vec randu(_Tp a, _Tp b);
static Vec zeros();
#ifdef CV_CXX11
static Vec diag(_Tp alpha) = delete;
static Vec eye() = delete;
#endif
//! per-element multiplication
Vec mul(const Vec<_Tp, cn>& v) const;
@ -412,9 +410,7 @@ public:
const _Tp& operator ()(int i) const;
_Tp& operator ()(int i);
#ifdef CV_CXX11
Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
#endif
Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp);
Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp);

View File

@ -28,7 +28,7 @@
#define OPENCV_CORE_QUATERNION_INL_HPP
#ifndef OPENCV_CORE_QUATERNION_HPP
#erorr This is not a standalone header. Include quaternion.hpp instead.
#error This is not a standalone header. Include quaternion.hpp instead.
#endif
//@cond IGNORE

View File

@ -9,8 +9,6 @@
//#define OPENCV_DISABLE_ALLOCATOR_STATS
#ifdef CV_CXX11
#include <atomic>
#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
@ -26,14 +24,6 @@
#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
#endif
#else // CV_CXX11
#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int // CV_XADD supports int only
#endif
#endif // CV_CXX11
namespace cv { namespace utils {
#ifdef CV__ALLOCATOR_STATS_LOG
@ -59,7 +49,7 @@ public:
void onAllocate(size_t /*sz*/) {}
void onFree(size_t /*sz*/) {}
#elif defined(CV_CXX11)
#else
protected:
typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
@ -104,49 +94,7 @@ public:
#endif
curr -= (counter_t)sz;
}
#else // non C++11
protected:
typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
volatile counter_t curr, total, total_allocs, peak; // overflow is possible, CV_XADD operates with 'int' only
public:
AllocatorStatistics()
: curr(0), total(0), total_allocs(0), peak(0)
{}
~AllocatorStatistics() CV_OVERRIDE {}
uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; }
uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; }
uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; }
uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; }
void resetPeakUsage() CV_OVERRIDE { peak = curr; }
// Controller interface
void onAllocate(size_t sz)
{
#ifdef CV__ALLOCATOR_STATS_LOG
CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
#endif
counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz;
peak = std::max((counter_t)peak, new_curr); // non-thread safe
//CV_XADD(&total, (uint64_t)sz); // overflow with int, non-reliable...
total += sz;
CV_XADD(&total_allocs, (counter_t)1);
}
void onFree(size_t sz)
{
#ifdef CV__ALLOCATOR_STATS_LOG
CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
#endif
CV_XADD(&curr, (counter_t)-sz);
}
#endif
#endif // OPENCV_DISABLE_ALLOCATOR_STATS
};
#ifdef CV__ALLOCATOR_STATS_LOG

View File

@ -3,7 +3,6 @@
// of this distribution and at http://opencv.org/license.html.
#include "precomp.hpp"
//#undef CV_CXX11 // debug non C++11 mode
#include "opencv2/core/async.hpp"
#include "opencv2/core/detail/async_promise.hpp"
@ -16,11 +15,9 @@
#ifndef OPENCV_DISABLE_THREAD_SUPPORT
#ifdef CV_CXX11
#include <mutex>
#include <condition_variable>
#include <chrono>
#endif
namespace cv {
@ -37,12 +34,8 @@ struct AsyncArray::Impl
void releasePromise() CV_NOEXCEPT { CV_XADD(&refcount_promise, -1); if(1 == CV_XADD(&refcount, -1)) delete this; } \
int refcount_promise;
#ifdef CV_CXX11
mutable std::mutex mtx;
mutable std::condition_variable cond_var;
#else
mutable cv::Mutex mtx;
#endif
mutable bool has_result; // Mat, UMat or exception
@ -88,11 +81,7 @@ struct AsyncArray::Impl
if (!wait_for(timeoutNs))
return false;
}
#ifdef CV_CXX11
std::unique_lock<std::mutex> lock(mtx);
#else
cv::AutoLock lock(mtx);
#endif
if (has_result)
{
if (!result_mat.empty())
@ -145,7 +134,6 @@ struct AsyncArray::Impl
if (timeoutNs == 0)
return has_result;
CV_LOG_INFO(NULL, "Waiting for async result ...");
#ifdef CV_CXX11
std::unique_lock<std::mutex> lock(mtx);
const auto cond_pred = [&]{ return has_result == true; };
if (timeoutNs > 0)
@ -156,9 +144,6 @@ struct AsyncArray::Impl
CV_Assert(has_result);
return true;
}
#else
CV_Error(Error::StsNotImplemented, "OpenCV has been built without async waiting support (C++11 is required)");
#endif
}
AsyncArray getArrayResult()
@ -175,11 +160,7 @@ struct AsyncArray::Impl
{
if (future_is_returned && refcount_future == 0)
CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
#ifdef CV_CXX11
std::unique_lock<std::mutex> lock(mtx);
#else
cv::AutoLock lock(mtx);
#endif
CV_Assert(!has_result);
int k = value.kind();
if (k == _InputArray::UMAT)
@ -193,9 +174,7 @@ struct AsyncArray::Impl
value.copyTo(*result_mat.get());
}
has_result = true;
#ifdef CV_CXX11
cond_var.notify_all();
#endif
}
#if CV__EXCEPTION_PTR
@ -203,18 +182,12 @@ struct AsyncArray::Impl
{
if (future_is_returned && refcount_future == 0)
CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
#ifdef CV_CXX11
std::unique_lock<std::mutex> lock(mtx);
#else
cv::AutoLock lock(mtx);
#endif
CV_Assert(!has_result);
has_exception = true;
exception = e;
has_result = true;
#ifdef CV_CXX11
cond_var.notify_all();
#endif
}
#endif
@ -222,18 +195,12 @@ struct AsyncArray::Impl
{
if (future_is_returned && refcount_future == 0)
CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
#ifdef CV_CXX11
std::unique_lock<std::mutex> lock(mtx);
#else
cv::AutoLock lock(mtx);
#endif
CV_Assert(!has_result);
has_exception = true;
cv_exception = e;
has_result = true;
#ifdef CV_CXX11
cond_var.notify_all();
#endif
}
};

View File

@ -1952,12 +1952,7 @@ void _OutputArray::move(UMat& u) const
int k = kind();
if (k == UMAT)
{
#ifdef CV_CXX11
*(UMat*)obj = std::move(u);
#else
*(UMat*)obj = u;
u.release();
#endif
}
else if (k == MAT)
{
@ -1992,12 +1987,7 @@ void _OutputArray::move(Mat& m) const
}
else if (k == MAT)
{
#ifdef CV_CXX11
*(Mat*)obj = std::move(m);
#else
*(Mat*)obj = m;
m.release();
#endif
}
else if (k == MATX)
{

View File

@ -912,8 +912,7 @@ int getNumberOfCPUs_()
* the minimum most value as it has high probability of being right and safe.
* Return 1 if we get 0 or not found on all methods.
*/
#if defined CV_CXX11 \
&& !defined(__MINGW32__) /* not implemented (2020-03) */ \
#if !defined(__MINGW32__) /* not implemented (2020-03) */
/*
* Check for this standard C++11 way, we do not return directly because

View File

@ -120,11 +120,15 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
#include <cstdlib> // std::abort
#endif
#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__
#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__ || defined __QNX__
# include <unistd.h>
# include <fcntl.h>
#if defined __QNX__
# include <sys/elf.h>
# include <sys/auxv.h>
using Elf64_auxv_t = auxv64_t;
# include <elfdefinitions.h>
const uint64_t AT_HWCAP = NT_GNU_HWCAP;
#else
# include <elf.h>
#endif
@ -251,7 +255,7 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
#include "omp.h"
#endif
#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__
#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __GLIBC__ || defined __HAIKU__
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
@ -301,9 +305,7 @@ DECLARE_CV_CPUID_X86
#endif
#endif
#if defined CV_CXX11
#include <chrono>
#endif
#include <chrono>
namespace cv
{
@ -562,7 +564,7 @@ struct HWFeatures
}
#endif // CV_CPUID_X86
#if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __QNX__
#if defined __ANDROID__ || defined __linux__ || defined __QNX__
#ifdef __aarch64__
have[CV_CPU_NEON] = true;
have[CV_CPU_FP16] = true;
@ -581,10 +583,12 @@ struct HWFeatures
have[CV_CPU_NEON_DOTPROD] = (auxv.a_un.a_val & (1 << 20)) != 0; // HWCAP_ASIMDDP
have[CV_CPU_NEON_FP16] = (auxv.a_un.a_val & (1 << 10)) != 0; // HWCAP_ASIMDHP
}
#if defined(AT_HWCAP2)
else if (auxv.a_type == AT_HWCAP2)
{
have[CV_CPU_NEON_BF16] = (auxv.a_un.a_val & (1 << 14)) != 0; // HWCAP2_BF16
}
#endif
}
close(cpufile);
@ -611,7 +615,7 @@ struct HWFeatures
CV_LOG_INFO(NULL, "- FP16 instructions is NOT enabled via build flags");
#endif
#endif
#elif defined __arm__ && !defined __FreeBSD__
#elif defined __arm__
int cpufile = open("/proc/self/auxv", O_RDONLY);
if (cpufile >= 0)
@ -903,50 +907,15 @@ bool useOptimized(void)
int64 getTickCount(void)
{
#if defined CV_CXX11
std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now();
return (int64)now.time_since_epoch().count();
#elif defined _WIN32 || defined WINCE
LARGE_INTEGER counter;
QueryPerformanceCounter( &counter );
return (int64)counter.QuadPart;
#elif defined __MACH__ && defined __APPLE__
return (int64)mach_absolute_time();
#elif defined __unix__
struct timespec tp;
clock_gettime(CLOCK_MONOTONIC, &tp);
return (int64)tp.tv_sec*1000000000 + tp.tv_nsec;
#else
struct timeval tv;
gettimeofday(&tv, NULL);
return (int64)tv.tv_sec*1000000 + tv.tv_usec;
#endif
}
double getTickFrequency(void)
{
#if defined CV_CXX11
using clock_period_t = std::chrono::steady_clock::duration::period;
double clock_freq = clock_period_t::den / clock_period_t::num;
return clock_freq;
#elif defined _WIN32 || defined WINCE
LARGE_INTEGER freq;
QueryPerformanceFrequency(&freq);
return (double)freq.QuadPart;
#elif defined __MACH__ && defined __APPLE__
static double freq = 0;
if( freq == 0 )
{
mach_timebase_info_data_t sTimebaseInfo;
mach_timebase_info(&sTimebaseInfo);
freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer;
}
return freq;
#elif defined __unix__
return 1e9;
#else
return 1e6;
#endif
}
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__ || defined __ppc__)

View File

@ -7,7 +7,7 @@
#include <opencv2/core/bindings_utils.hpp>
#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)
#include <thread>
#include <chrono>
#endif
@ -85,7 +85,7 @@ TEST(Core_Async, LikePythonTest)
}
#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
#if !defined(OPENCV_DISABLE_THREAD_SUPPORT)
TEST(Core_Async, AsyncThread_Simple)
{

View File

@ -8,10 +8,8 @@
#include <opencv2/core/utils/fp_control_utils.hpp>
#ifdef CV_CXX11
#include <chrono>
#include <thread>
#endif
namespace opencv_test { namespace {
@ -282,9 +280,7 @@ public:
// FP state is not supported
// no checks
}
#ifdef CV_CXX11
std::this_thread::sleep_for(std::chrono::milliseconds(100));
#endif
}
cv::details::FPDenormalsModeState base_state;

View File

@ -4,6 +4,8 @@
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <array>
#include "opencv2/ts.hpp"
#include "opencv2/ts/ocl_test.hpp"
#include "opencv2/core/private.hpp"

View File

@ -4,9 +4,7 @@
// This is .hpp file included from test_utils.cpp
#ifdef CV_CXX11
#include <thread> // std::thread
#endif
#include "opencv2/core/utils/tls.hpp"
@ -34,8 +32,6 @@ public:
int TLSReporter::g_last_id = 0;
int TLSReporter::g_allocated = 0;
#ifdef CV_CXX11
template<typename T>
static void callNThreadsWithTLS(int N, TLSData<T>& tls)
{
@ -129,6 +125,4 @@ static void testTLSAccumulator(bool detachFirst)
TEST(Core_TLS, AccumulatorHoldData_detachData) { testTLSAccumulator(true); }
TEST(Core_TLS, AccumulatorHoldData_gather) { testTLSAccumulator(false); }
#endif
}} // namespace

View File

@ -1183,6 +1183,11 @@ CV__DNN_INLINE_NS_BEGIN
static Ptr<AttentionLayer> create(const LayerParams &params);
};
class CV_EXPORTS GroupNormLayer : public Layer {
public:
static Ptr<GroupNormLayer> create(const LayerParams &params);
};
//! @}
//! @}
CV__DNN_INLINE_NS_END

View File

@ -444,7 +444,7 @@ CV__DNN_INLINE_NS_BEGIN
* Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine
* backend.
*/
CV_WRAP static Net readFromModelOptimizer(const String& xml, const String& bin);
CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin);
/** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR).
* @param[in] bufferModelConfig buffer with model's configuration.
@ -477,7 +477,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param path path to output file with .dot extension
* @see dump()
*/
CV_WRAP void dumpToFile(const String& path);
CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path);
/** @brief Adds new layer to the net.
* @param name unique name of the adding layer.
* @param type typename of the adding layer (type must be registered in LayerRegister).
@ -839,7 +839,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param darknetModel path to the .weights file with learned network.
* @returns Network object that ready to do forward, throw an exception in failure cases.
*/
CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String());
CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String());
/** @brief Reads a network model stored in <a href="https://pjreddie.com/darknet/">Darknet</a> model files.
* @param bufferCfg A buffer contains a content of .cfg file with text description of the network architecture.
@ -864,7 +864,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param caffeModel path to the .caffemodel file with learned network.
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String());
CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String());
/** @brief Reads a network model stored in Caffe model in memory.
* @param bufferProto buffer containing the content of the .prototxt file
@ -893,7 +893,7 @@ CV__DNN_INLINE_NS_BEGIN
* let us make it more flexible.
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String());
CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String());
/** @brief Reads a network model stored in <a href="https://www.tensorflow.org/">TensorFlow</a> framework's format.
* @param bufferModel buffer containing the content of the pb file
@ -918,7 +918,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param model path to the .tflite file with binary flatbuffers description of the network architecture
* @returns Net object.
*/
CV_EXPORTS_W Net readNetFromTFLite(const String &model);
CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model);
/** @brief Reads a network model stored in <a href="https://www.tensorflow.org/lite">TFLite</a> framework's format.
* @param bufferModel buffer containing the content of the tflite file
@ -957,7 +957,7 @@ CV__DNN_INLINE_NS_BEGIN
* or @ref readNetFromDarknet. An order of @p model and @p config
* arguments does not matter.
*/
CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = "");
CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = "");
/**
* @brief Read deep learning network represented in one of the supported formats.
@ -979,7 +979,7 @@ CV__DNN_INLINE_NS_BEGIN
* backend.
*/
CV_EXPORTS_W
Net readNetFromModelOptimizer(const String &xml, const String &bin = "");
Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = "");
/** @brief Load a network from Intel's Model Optimizer intermediate representation.
* @param[in] bufferModelConfig Buffer contains XML configuration with network's topology.
@ -1008,7 +1008,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param onnxFile path to the .onnx file with text description of the network architecture.
* @returns Network object that ready to do forward, throw an exception in failure cases.
*/
CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile);
CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile);
/** @brief Reads a network model from <a href="https://onnx.ai/">ONNX</a>
* in-memory buffer.
@ -1031,7 +1031,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param path to the .pb file with input tensor.
* @returns Mat.
*/
CV_EXPORTS_W Mat readTensorFromONNX(const String& path);
CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path);
/** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center,
* subtract @p mean values, scales values by @p scalefactor, swap Blue and Red channels.
@ -1204,7 +1204,7 @@ CV__DNN_INLINE_NS_BEGIN
* is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe.
* So the resulting model may be used there.
*/
CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst,
CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst,
const std::vector<String>& layersTypes = std::vector<String>());
/** @brief Create a text representation for a binary network stored in protocol buffer format.
@ -1213,7 +1213,7 @@ CV__DNN_INLINE_NS_BEGIN
*
* @note To reduce output file size, trained weights are not included.
*/
CV_EXPORTS_W void writeTextGraph(const String& model, const String& output);
CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output);
/** @brief Performs non maximum suppression given boxes and corresponding scores.
@ -1318,7 +1318,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param[in] model Binary file contains trained weights.
* @param[in] config Text file contains network configuration.
*/
CV_WRAP Model(const String& model, const String& config = "");
CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
/**
* @brief Create model from deep learning network.
@ -1423,7 +1423,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param[in] model Binary file contains trained weights.
* @param[in] config Text file contains network configuration.
*/
CV_WRAP ClassificationModel(const String& model, const String& config = "");
CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
/**
* @brief Create model from deep learning network.
@ -1473,7 +1473,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param[in] model Binary file contains trained weights.
* @param[in] config Text file contains network configuration.
*/
CV_WRAP KeypointsModel(const String& model, const String& config = "");
CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
/**
* @brief Create model from deep learning network.
@ -1505,7 +1505,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param[in] model Binary file contains trained weights.
* @param[in] config Text file contains network configuration.
*/
CV_WRAP SegmentationModel(const String& model, const String& config = "");
CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
/**
* @brief Create model from deep learning network.
@ -1536,7 +1536,7 @@ CV__DNN_INLINE_NS_BEGIN
* @param[in] model Binary file contains trained weights.
* @param[in] config Text file contains network configuration.
*/
CV_WRAP DetectionModel(const String& model, const String& config = "");
CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "");
/**
* @brief Create model from deep learning network.
@ -1602,7 +1602,7 @@ public:
* @param[in] config Text file contains network configuration
*/
CV_WRAP inline
TextRecognitionModel(const std::string& model, const std::string& config = "")
TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
: TextRecognitionModel(readNet(model, config)) { /* nothing */ }
/**
@ -1757,7 +1757,7 @@ public:
* @param[in] config Text file contains network configuration.
*/
CV_WRAP inline
TextDetectionModel_EAST(const std::string& model, const std::string& config = "")
TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
: TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ }
/**
@ -1818,7 +1818,7 @@ public:
* @param[in] config Text file contains network configuration.
*/
CV_WRAP inline
TextDetectionModel_DB(const std::string& model, const std::string& config = "")
TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "")
: TextDetectionModel_DB(readNet(model, config)) { /* nothing */ }
CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold);

View File

@ -258,22 +258,21 @@ PERF_TEST_P_(Layer_Slice, FastNeuralStyle_eccv16)
test_slice<4>(inputShape, begin, end);
}
struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
{
void test_layer(const std::vector<int>& shape, const String reduction = "none", int axis = 0)
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
using Layer_Scatter = TestBaseWithParam<tuple<std::vector<int>, std::string, int, tuple<Backend, Target>>>;
PERF_TEST_P_(Layer_Scatter, scatter) {
std::vector<int> shape = get<0>(GetParam());
std::string reduction = get<1>(GetParam());
int axis = get<2>(GetParam());
int backend_id = get<0>(get<3>(GetParam()));
int target_id = get<1>(get<3>(GetParam()));
Mat data(shape, CV_32FC1);
Mat indices(shape, CV_32FC1);
Mat updates(shape, CV_32FC1);
Scalar mean = 0.f;
Scalar std = 1.f;
randn(data, mean, std);
randn(data, 0.f, 1.f);
randu(indices, 0, shape[axis]);
randn(updates, mean, std);
randn(updates, 0.f, 1.f);
indices.convertTo(indices, CV_32SC1, 1, -1);
@ -291,50 +290,46 @@ struct Layer_Scatter : public TestBaseWithParam<tuple<Backend, Target> >
// warmup
{
std::vector<String> inpNames(3);
inpNames[0] = "data";
inpNames[1] = "indices";
inpNames[2] = "updates";
net.setInputsNames(inpNames);
net.setInput(data, inpNames[0]);
net.setInput(indices, inpNames[1]);
net.setInput(updates, inpNames[2]);
std::vector<String> input_names{"data", "indices", "updates"};
net.setInputsNames(input_names);
net.setInput(data, input_names[0]);
net.setInput(indices, input_names[1]);
net.setInput(updates, input_names[2]);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
net.setPreferableBackend(backend_id);
net.setPreferableTarget(target_id);
Mat out = net.forward();
}
// perf
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
int N = 8;
int C = 256;
int H = 128;
int W = 100;
};
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter)
{
test_layer({N, C, H, W});
}
PERF_TEST_P_(Layer_Scatter, DISABLED_Scatter_add)
{
test_layer({N, C, H, W}, "add");
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, Combine(
Values(std::vector<int>{2, 128, 64, 50}),
Values(std::string("none"), std::string("add")),
Values(0), // use Values(0, 1, 2, 3) for more details
dnnBackendsAndTargets(/* withInferenceEngine= */ false,
/* withHalide= */ false,
/* withCpuOCV= */ true,
/* withVkCom= */ false,
/* withCUDA= */ false,
/* withNgraph= */ false,
/* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
));
struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
{
void test_layer(const std::vector<int>& shape, const String reduction = "none")
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
using Layer_ScatterND = TestBaseWithParam<tuple<std::vector<int>, std::string, tuple<Backend, Target>>>;
PERF_TEST_P_(Layer_ScatterND, scatterND) {
std::vector<int> shape = get<0>(GetParam());
std::string reduction = get<1>(GetParam());
int backend_id = get<0>(get<2>(GetParam()));
int target_id = get<1>(get<2>(GetParam()));
std::vector<int> indices_shape(shape);
indices_shape.push_back(int(shape.size()));
@ -342,12 +337,10 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
Mat indices(indices_shape, CV_32FC1);
Mat updates(shape, CV_32FC1);
Scalar mean = 0.f;
Scalar std = 1.f;
randn(data, mean, std);
randn(updates, mean, std);
randn(data, 0.f, 1.f);
randn(updates, 0.f, 1.f);
// initialize the indices with index tuples like [0...N, 0...C, 0...H, 0...W]
// Create indices such that indices[n_i, c_j, h_k, w_l, :4] = [i, j, k, l]
std::vector<int> current_index_tuple(shape.size());
int total = data.total();
std::vector<int> indices_step;
@ -357,6 +350,7 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
indices_step.push_back(step);
}
int t, j, idx, offset_at_idx, offset;
auto *indices_ptr = indices.ptr<float>();
for (int i = 0; i < total; i++)
{
t = i;
@ -373,7 +367,7 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
offset += current_index_tuple[j] * indices_step[j];
for (j = 0; j < shape.size(); j++)
indices.at<float>(offset + j) = current_index_tuple[j];
indices_ptr[offset + j] = current_index_tuple[j];
}
Net net;
@ -389,17 +383,14 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
// warmup
{
std::vector<String> inpNames(3);
inpNames[0] = "data";
inpNames[1] = "indices";
inpNames[2] = "updates";
net.setInputsNames(inpNames);
net.setInput(data, inpNames[0]);
net.setInput(indices, inpNames[1]);
net.setInput(updates, inpNames[2]);
std::vector<String> input_names{"data", "indices", "updates"};
net.setInputsNames(input_names);
net.setInput(data, input_names[0]);
net.setInput(indices, input_names[1]);
net.setInput(updates, input_names[2]);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
net.setPreferableBackend(backend_id);
net.setPreferableTarget(target_id);
Mat out = net.forward();
}
@ -409,23 +400,20 @@ struct Layer_ScatterND : public TestBaseWithParam<tuple<Backend, Target> >
}
SANITY_CHECK_NOTHING();
}
int N = 8;
int C = 256;
int H = 128;
int W = 100;
};
PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND)
{
test_layer({N, C, H ,W});
}
PERF_TEST_P_(Layer_ScatterND, DISABLED_ScatterND_add)
{
test_layer({N, C, H , W}, "add");
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, Combine(
Values(std::vector<int>{2, 128, 64, 50}),
Values(std::string("none"), std::string("add")),
dnnBackendsAndTargets(/* withInferenceEngine= */ false,
/* withHalide= */ false,
/* withCpuOCV= */ true,
/* withVkCom= */ false,
/* withCUDA= */ false,
/* withNgraph= */ false,
/* withWebnn= */ false,
/* withCann= */ false) // only test on CPU
));
struct Layer_LayerNorm : public TestBaseWithParam<tuple<Backend, Target> >
{
@ -795,19 +783,77 @@ PERF_TEST_P_(Layer_Attention, VisionTransformer) {
test_layer({1, 197, 768}, {768, 768, 768}, 12);
}
struct Layer_GroupNorm : public TestBaseWithParam<tuple<Backend, Target> >
{
void test_layer(const std::vector<int>& x_shape, int num_groups)
{
int backendId = get<0>(GetParam());
int targetId = get<1>(GetParam());
Mat x(x_shape, CV_32FC1);
Mat scale(x_shape[1], 1, CV_32FC1);
Mat b(x_shape[1], 1, CV_32FC1);
randu(x, 0.f, 1.f);
randu(scale, 0.f, 1.f);
randu(b, 0.f, 1.f);
Net net;
LayerParams lp;
lp.type = "GroupNormalization";
lp.name = "testLayer";
lp.set("num_groups", num_groups);
int id = net.addLayerToPrev(lp.name, lp.type, lp);
net.connect(0, 0, id, 0);
net.connect(0, 1, id, 1);
net.connect(0, 2, id, 2);
// warmup
{
std::vector<String> inpNames{"x", "scale", "b"};
net.setInputsNames(inpNames);
net.setInput(x, inpNames[0]);
net.setInput(scale, inpNames[1]);
net.setInput(b, inpNames[2]);
net.setPreferableBackend(backendId);
net.setPreferableTarget(targetId);
Mat out = net.forward();
}
TEST_CYCLE()
{
Mat res = net.forward();
}
SANITY_CHECK_NOTHING();
}
int N = 2;
int C = 64;
int H = 180;
int W = 240;
int num_groups = 16;
};
PERF_TEST_P_(Layer_GroupNorm, GroupNorm)
{
test_layer({N, C, H, W}, num_groups);
}
INSTANTIATE_TEST_CASE_P(/**/, Layer_Slice, dnnBackendsAndTargets(false, false));
INSTANTIATE_TEST_CASE_P(/**/, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
#ifdef HAVE_CUDA
INSTANTIATE_TEST_CASE_P(CUDA, Layer_NaryEltwise, testing::Values(std::make_tuple(DNN_BACKEND_CUDA, DNN_TARGET_CUDA)));
#endif
INSTANTIATE_TEST_CASE_P(/**/, Layer_Scatter, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_ScatterND, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_LayerNormExpanded, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_GatherElements, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_InstanceNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_Attention, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
INSTANTIATE_TEST_CASE_P(/**/, Layer_GroupNorm, testing::Values(std::make_tuple(DNN_BACKEND_OPENCV, DNN_TARGET_CPU)));
typedef TestBaseWithParam<tuple<Vec4i, int, bool, tuple<Backend, Target> > > Layer_FullyConnected;
PERF_TEST_P_(Layer_FullyConnected, fc)

View File

@ -132,8 +132,23 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
}
else
{
CV_Assert(is_shape_compatible(output, x));
CV_Assert(is_shape_compatible(output, y));
auto inShape1 = x.shape_as_vector();
auto inShape2 = y.shape_as_vector();
auto outShape = output.shape_as_vector();
std::size_t x_ndims = inShape1.size(), y_ndims = inShape2.size();
if (x_ndims >= y_ndims) {
for (std::size_t i = 0; i < (x_ndims - y_ndims); i++) {
inShape2.insert(inShape2.begin(), 1);
}
} else {
for (std::size_t i = 0; i < (y_ndims - x_ndims); i++) {
inShape1.insert(inShape1.begin(), 1);
}
}
CV_Assert(is_shape_compatible1(outShape, inShape1));
CV_Assert(is_shape_compatible1(outShape, inShape2));
/* matching singleton axes in both input tensors can be eliminated
*
@ -148,20 +163,21 @@ void eltwise_op(const Stream& stream, TensorSpan<T> output, TensorView<T> x, Ten
* x: [1, 256, 32, 32] -> [256, 32, 32]
* y: [1, 256, 1, 1] -> [256, 1, 1]
*/
for (int r = 0; r < output.rank(); r++)
{
while (x.rank() > r && y.rank() > r && x.get_axis_size(r) == 1 && y.get_axis_size(r) == 1) {
CV_Assert(output.get_axis_size(r) == 1);
x.squeeze(r);
y.squeeze(r);
output.squeeze(r);
int eliminate_times = 0;
for (std::size_t i = 0; i < outShape.size(); i++) {
if (inShape1[i] == 1 && inShape2[i] == 1 && outShape[i] == 1 && i != (outShape.size() - 1)) {
eliminate_times++;
} else {
break;
}
}
if (eliminate_times > 0) {
for (int i = 0; i < eliminate_times; i++) {
inShape1.erase(inShape1.begin());
inShape2.erase(inShape2.begin());
outShape.erase(outShape.begin());
}
}
auto inShape1 = x.shape_as_vector();
auto inShape2 = y.shape_as_vector();
auto outShape = output.shape_as_vector();
/* contiguous axes that do not broadcast can be merged into one axis
*
@ -324,7 +340,19 @@ void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
eltwise_op<T, SubFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_mod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, ModFunctor<T>>(stream, output, x, y);
}
template <class T>
void eltwise_fmod_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x, TensorView<T> y) {
eltwise_op<T, FModFunctor<T>>(stream, output, x, y);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void eltwise_mod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_sub_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_div_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
@ -333,6 +361,8 @@ void eltwise_sub_2(const Stream& stream, TensorSpan<T> output, TensorView<T> x,
template void eltwise_max_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
template void eltwise_min_2(const Stream& stream, TensorSpan<__half> output, TensorView<__half> x, TensorView<__half> y);
#endif
template void eltwise_mod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_fmod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_sub_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_div_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);
template void eltwise_prod_2(const Stream& stream, TensorSpan<float> output, TensorView<float> x, TensorView<float> y);

View File

@ -799,6 +799,40 @@ struct ReciprocalFunctor {
}
};
template <class T>
struct ModFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() {}
};
CUDA4DNN_DEVICE ModFunctor() { }
CUDA4DNN_DEVICE ModFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
int res = (int)x % (int)y;
T zero = T(0);
if ((res > (int)zero && y < zero) || (res < (int)zero && y > zero)) {
res += (int)y;
}
return res;
}
};
template <class T>
struct FModFunctor {
struct Params {
CUDA4DNN_HOST_DEVICE Params() {}
};
CUDA4DNN_DEVICE FModFunctor() { }
CUDA4DNN_DEVICE FModFunctor(const Params& params) { }
CUDA4DNN_DEVICE T operator()(T x, T y) {
using csl::device::fmod;
return fmod(x, y);
}
};
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA_FUNCTORS_HPP */

View File

@ -36,6 +36,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de
template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
template <class T> __device__ T fmod(T x, T y) { return x % y; }
template <> inline __device__ float fmod(float x, float y) { return fmodf(x, y); }
template <> inline __device__ double fmod(double x, double y) { return fmod(x, y); }
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ half fmod(half x, half y) { return fmodf((float)x, (float)y); }
#endif
template <class T> __device__ T log1p(T val);
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }

View File

@ -78,6 +78,18 @@ namespace raw {
}
}
template <class T>
__global__ void normalize_mean_variance_groupwise(Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, size_type inner_size, size_type C, size_type num_groups, size_type group_size) {
for (auto idx : grid_stride_range(output.size())) {
const index_type outer_idx = idx / inner_size;
const index_type c = outer_idx % C;
const index_type group_idx = outer_idx / group_size;
auto s = static_cast<float>(scale[c]) * inv_stddev[group_idx];
auto b = static_cast<float>(bias[c]);
output[idx] = (static_cast<float>(input[idx]) - means[group_idx]) * s + b;
}
}
template <class T>
__global__ void normalize_mean_variance_layernorm(Span<T> output, View<T> input, View<T> scale, View<float> means, View<float> inv_stddev, size_type inner_size) {
for (auto idx : grid_stride_range(output.size())) {
@ -191,6 +203,24 @@ template void normalize_mean_variance_channelwise(const Stream&, Span<__half> /*
#endif
template void normalize_mean_variance_channelwise(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t);
template <class T>
void normalize_mean_variance_groupwise(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<T> bias, View<float> means, View<float> inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size)
{
CV_Assert(input.size() == output.size());
CV_Assert(input.size() / inner_size == means.size() * group_size);
CV_Assert(means.size() == inv_stddev.size());
auto kernel = raw::normalize_mean_variance_groupwise<T>;
auto policy = make_policy(kernel, output.size(), 0, stream);
launch_kernel(kernel, policy, output, input, scale, bias, means, inv_stddev, inner_size, C, num_groups, group_size);
}
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
template void normalize_mean_variance_groupwise(const Stream&, Span<__half> /*output*/, View<__half> /*input*/, View<__half> /*scale*/, View<__half> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t);
#endif
template void normalize_mean_variance_groupwise(const Stream&, Span<float> /*output*/, View<float> /*input*/, View<float> /*scale*/, View<float> /*bias*/, View<float> /*means*/, View<float> /*inv_stddev*/, std::size_t, std::size_t, std::size_t, std::size_t);
template <class T>
void normalize_mean_variance_layernorm(const Stream& stream, Span<T> output, View<T> input, View<T> scale, View<float> means, View<float> inv_stddev, std::size_t inner_size)
{

View File

@ -1262,6 +1262,23 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
return true;
}
template <typename ShapeType>
bool is_shape_compatible1(const ShapeType &x_shape, const ShapeType &y_shape) noexcept {
const auto x_ndims = x_shape.size(), y_ndims = y_shape.size();
if (x_ndims != y_ndims) {
return false;
}
for (int i = 0; i < x_ndims; i++) {
if (x_shape[i] != y_shape[i] && x_shape[i] != 1 && y_shape[i] != 1) {
return false;
}
}
return true;
}
/** returns the rank to which the given tensor can be squeezed to */
template <class TensorType>
std::size_t get_effective_rank(const TensorType& x) noexcept {

View File

@ -33,6 +33,12 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
template <class T>
void eltwise_sub_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
template <class T>
void eltwise_mod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
template <class T>
void eltwise_fmod_2(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> x, csl::TensorView<T> y);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */

View File

@ -35,6 +35,10 @@ void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span<T> o
template <class T>
void normalize_mean_variance_layernorm(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<T> bias, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size);
template <class T>
void normalize_mean_variance_groupwise(const csl::Stream &stream, csl::Span<T> output, csl::View<T> input, csl::View<T> scale, csl::View<T> bias, csl::View<float> means, csl::View<float> inv_stddev, std::size_t inner_size, std::size_t C, std::size_t num_groups, std::size_t group_size);
}}}} /* namespace cv::dnn::cuda4dnn::kernels */
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MVN_HPP */

View File

@ -28,6 +28,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
DIV,
MIN,
SUB,
MOD,
FMOD,
};
class EltwiseOpBase : public CUDABackendNode {
@ -90,6 +92,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
break;
case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, input_x, input_y); break;
case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, input_x, input_y); break;
case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, input_x, input_y); break;
}
}
else
@ -122,6 +126,8 @@ namespace cv { namespace dnn { namespace cuda4dnn {
}
break;
case EltwiseOpType::SUB: kernels::eltwise_sub_2<T>(stream, output, output, input); break;
case EltwiseOpType::MOD: kernels::eltwise_mod_2<T>(stream, output, output, input); break;
case EltwiseOpType::FMOD: kernels::eltwise_fmod_2<T>(stream, output, output, input); break;
}
}
}

View File

@ -0,0 +1,87 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP
#include "../../op_cuda.hpp"
#include "../csl/stream.hpp"
#include "../csl/span.hpp"
#include "../csl/tensor.hpp"
#include "../csl/workspace.hpp"
#include "../kernels/fill_copy.hpp"
#include "../kernels/mvn.hpp"
#include <opencv2/core.hpp>
#include <cstddef>
#include <vector>
#include <utility>
namespace cv { namespace dnn { namespace cuda4dnn {
template <class T>
class GroupNormOp final : public CUDABackendNode {
public:
using wrapper_type = GetCUDABackendWrapperType<T>;
GroupNormOp(csl::Stream stream_, float epsilon_, size_t loops, size_t num_groups)
: stream(std::move(stream_)), epsilon(epsilon_), num_groups(num_groups) {
csl::WorkspaceBuilder builder;
builder.require<float>(loops * num_groups); // mean and stdev for each group
builder.require<float>(loops * num_groups);
scratch_mem_in_bytes = builder.required_workspace_size();
}
void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override {
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
auto scale_wrapper = inputs[1].dynamicCast<wrapper_type>();
auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
auto input = input_wrapper->getView();
auto scale = scale_wrapper->getView();
auto bias = bias_wrapper->getView();
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
auto output = output_wrapper->getSpan();
auto C = input.get_axis_size(1);
auto loops = input.size_range(0, 2);
auto norm_size = input.size_range(2, input.rank());
auto num_groups = this->num_groups;
auto group_size = C / num_groups;
if (norm_size == 1) {
kernels::fill<T>(stream, output, 0.f);
return;
} else {
auto ws_allocator = csl::WorkspaceAllocator(workspace);
auto mean = ws_allocator.get_span<float>(loops / group_size);
kernels::fill<float>(stream, mean, 0.f);
auto stdev = ws_allocator.get_span<float>(loops / group_size);
kernels::fill<float>(stream, stdev, 0.f);
kernels::reduce_mean_sqr_sum<T>(stream, mean, stdev, input, norm_size * group_size);
kernels::compute_normalization_scale(stream, stdev, mean, stdev, norm_size * group_size, epsilon);
kernels::normalize_mean_variance_groupwise<T>(stream, output, input, scale, bias, mean, stdev, norm_size, C, num_groups, group_size);
}
}
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
private:
csl::Stream stream;
float epsilon;
std::size_t num_groups;
std::size_t scratch_mem_in_bytes;
};
}}} // cv::dnn::cuda4dnn
#endif // OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_GROUP_NORM_HPP

View File

@ -163,6 +163,7 @@ void initializeLayerFactory()
CV_DNN_REGISTER_LAYER_CLASS(Expand, ExpandLayer);
CV_DNN_REGISTER_LAYER_CLASS(InstanceNormalization, InstanceNormLayer);
CV_DNN_REGISTER_LAYER_CLASS(Attention, AttentionLayer);
CV_DNN_REGISTER_LAYER_CLASS(GroupNormalization, GroupNormLayer);
CV_DNN_REGISTER_LAYER_CLASS(Crop, CropLayer);
CV_DNN_REGISTER_LAYER_CLASS(Eltwise, EltwiseLayer);

View File

@ -969,6 +969,13 @@ public:
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
#if CV_RVP052
if(isConv2D)
opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
else
#endif
{
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@ -1348,6 +1355,12 @@ public:
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
#if CV_RVP052
if(isConv2D)
opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
else
#endif
for( int i = 0; i < outCn; i += 2 )
{

View File

@ -302,6 +302,11 @@ public:
if( useLASX )
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
#if CV_RVP052
if( 1 )
opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
else
#endif
{
int i = 0;

View File

@ -13,6 +13,8 @@
#include "int8layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
#include "./layers_rvp052.hpp"
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/ocl4dnn.hpp"
#endif

View File

@ -0,0 +1,221 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include "./layers_rvp052.hpp"
#if CV_RVP052
namespace cv {
namespace dnn {
namespace opt_RVP052 {
/* int8 convolution inner kernel for Andes RVP052 (RISC-V DSP).
 Processes two output channels per outer iteration, accumulating dot products
 of the im2col row buffer against the weight rows with the packed
 multiply-accumulate intrinsic __nds__v_smaqa (four int8 products folded into
 each 32-bit lane per step).
 initOutput  : start accumulation from bias instead of the existing output.
 finalOutput : requantize (multiplier, outZp) and saturate to the int8 range.
 NOTE(review): bias[i + 1] and multiplier[i + 1] are read before the
 i + 1 >= outCn guard below; when outCn is odd this reads one element past
 the logical channel count — confirm callers pad these buffers. */
void fastConv(const int8_t *weights, size_t wstep, const int *bias,
const int8_t *rowbuf, int *output, const int *outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float *multiplier, bool initOutput, bool finalOutput)
{
int outCn = outShape[1];
size_t outPlaneSize = outShape[2] * outShape[3];
for (int i = 0; i < outCn; i += 2)
{
const int8_t *wptr0 = weights + i * wstep;
const int8_t *wptr1 = wptr0 + wstep;
int *outptr0 = output + i * outPlaneSize;
int *outptr1 = outptr0 + outPlaneSize;
int bias0 = bias[i], bias1 = bias[i + 1];
float mult0 = multiplier[i], mult1 = multiplier[i + 1];
if (i + 1 >= outCn)
{
// Odd channel count: alias the second lane onto the first so the
// paired computation below stays valid.
wptr1 = wptr0;
outptr1 = outptr0;
bias1 = bias0;
mult1 = mult0;
}
int j = 0;
for (; j < blockSize; j++)
{
const int8_t *rptr = rowbuf + j * vecsize_aligned;
int s00 = initOutput ? bias0 : outptr0[j];
int s10 = initOutput ? bias1 : outptr1[j];
int32x2_t vsx0 = {s00, s10};
for (int k = 0; k < vecsize; k += 4)
{
// Same 4 input bytes feed both lanes; each lane gets its own weights.
int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
}
if (finalOutput)
{
// Requantize and saturate; sclip32(x, 7) is assumed to clip to the
// signed 8-bit range [-128, 127] — confirm against the Andes intrinsics doc.
vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
vsx0 = __nds__v_sclip32(vsx0, 7);
}
outptr0[j] = vsx0[0];
outptr1[j] = vsx0[1];
}
}
}
/* int8 3x3 depthwise convolution for one output channel (out_d) on Andes
 RVP052. Handles top/bottom/left/right borders by zeroing the out-of-range
 taps and folding the input zero point (inpZp) into the bias instead, then
 vectorizes the interior with __nds__v_smaqa, two output columns per step. */
void fastDepthwiseConv(const int8_t *wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int *biasptr, const float *multptr,
const int8_t *inptr_,
int height, int width,
int *outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp)
{
// 3x3 kernel taps; row 0 and row 2 keep mutable copies (w00.., w20..)
// because they are zeroed at the top/bottom borders.
const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
// outW1: last output column whose full receptive field is inside the row.
int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w);
int bias = biasptr[out_d], biasCopy;
float mult = multptr[out_d];
for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const int8_t *imgptr0 = inptr_ + in_i * width;
const int8_t *imgptr1 = imgptr0 + dilation_h * width;
const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width;
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
int out;
biasCopy = bias;
if (in_i < 0)
{
// Top border: drop row 0 of the kernel, compensate the zero point.
biasCopy += inpZp * (w00 + w01 + w02);
w00 = w01 = w02 = 0;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h * (kernel_h - 1) >= height)
{
// Bottom border: drop row 2 of the kernel.
biasCopy += inpZp * (w20 + w21 + w22);
w20 = w21 = w22 = 0;
imgptr2 = imgptr1;
}
int *outptr = outptr_ + out_i * outW;
if (pad_l > 0)
{
// Left border column computed scalar, with the first kernel column
// replaced by its zero-point contribution.
out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
(int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
(int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
biasCopy + inpZp * (w00 + w10 + w20);
outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
out_j = 1;
}
// Interior: two output columns per iteration; each 8-byte lane packs the
// three kernel rows of one column (fourth byte zero).
int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0};
int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0};
int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0};
int8x8_t vimgx0, vimgx1, vimgx2;
int32x2_t vout = {0, 0};
for (; out_j < outW1; out_j+=2)
{
int in_j = out_j * stride_w - pad_l;
vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0,
imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0};
vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0,
imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0};
vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0,
imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0};
vout = (int32x2_t){biasCopy, biasCopy};
vout = __nds__v_smaqa(vout, vwx0, vimgx0);
vout = __nds__v_smaqa(vout, vwx1, vimgx1);
vout = __nds__v_smaqa(vout, vwx2, vimgx2);
outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7);
outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7);
}
// The paired loop may overshoot outW1 by one; roll back so the scalar
// tail recomputes any column whose receptive field crosses the border.
// NOTE(review): when the overshoot happens, the loop above has already
// read input at in_j + ... + stride_w for that column — confirm those
// reads stay within the row.
while (out_j > outW1) out_j--;
for (; out_j < outW; out_j++)
{
// Right border: per-tap validity flags (s0..s2); invalid taps read
// index 0 (harmless) and contribute via the zero point instead.
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2;
int s0 = 1, s1 = 1, s2 = 1;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0;
biasCopy += inpZp * (w00 + w10 + w20);
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0;
biasCopy += inpZp * (w01 + w11 + w21);
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0;
biasCopy += inpZp * (w02 + w12 + w22);
}
out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
(int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
(int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
}
}
}
// dst = vec * weights^t + bias
// dst = vec * weights^t + bias
/* int8 vector-matrix product for Andes RVP052: computes nvecs dot products of
 the input vector against consecutive weight rows (row stride wstep), then
 requantizes with per-row multiplier and outZp and saturates to int8 range.
 Two rows are processed per iteration via the packed MAC intrinsic; a scalar
 tail handles an odd trailing row. vecsize is assumed to be a multiple of 4. */
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp )
{
int i = 0;
for( ; i <= nvecs - 2; i += 2 )
{
const int8_t* wptr0 = weights + i * wstep;
const int8_t* wptr1 = weights + (i + 1) * wstep;
// Both lanes start from their respective biases.
int32x2_t vs0 = *(int32x2_t*)(bias + i);
for( int k = 0; k < vecsize; k += 4 )
{
// Same 4 input bytes in both lanes; each lane has its own weight row.
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
}
// Requantize both lanes and saturate (sclip32(x, 7) — int8 range).
int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])};
vdst = __nds__v_sclip32(vdst + outZp, 7);
*(int32x2_t*)(dst + i) = vdst;
}
// Scalar tail for the last row when nvecs is odd.
for( ; i < nvecs; i++ )
{
const int8_t* wptr = weights + i * wstep;
int s0 = bias[i];
for( int k = 0; k < vecsize; k += 4 )
{
// Upper lane zeroed; only the first 4 bytes contribute.
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0};
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0};
s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec);
}
dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7);
}
}
}}} // namespace
#endif

View File

@ -0,0 +1,36 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Declarations of int8 convolution/GEMM kernels optimized for the Andes
// RVP052 core. Enabled only when building for RISC-V with the Andes DSP
// extension; otherwise CV_RVP052 is 0 and callers fall back to generic code.
#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES)
# include <nds_intrinsic.h>
# define CV_RVP052 1
namespace cv {
namespace dnn {
namespace opt_RVP052 {
// int8 convolution inner kernel: two output channels per pass over the
// im2col row buffer; requantizes with per-channel multiplier and outZp.
void fastConv( const int8_t* weights, size_t wstep, const int* bias,
const int8_t* rowbuf, int* output, const int* outShape,
int blockSize, int vecsize, int vecsize_aligned, int outZp,
const float* multiplier, bool initOutput, bool finalOutput );
// int8 3x3 depthwise convolution for one output channel with zero-point
// aware border handling.
void fastDepthwiseConv( const int8_t* wptr,
int kernel_h, int kernel_w,
int stride_h, int stride_w,
int dilation_h, int dilation_w,
int pad_t, int pad_l,
const int* biasptr, const float* multptr,
const int8_t* inptr_,
int height, int width,
int* outptr_,
int out_d, int outH, int outW,
int inpZp, int outZp );
// int8 vector-matrix product: dst = vec * weights^t + bias, requantized.
void fastGEMM1T( const int8_t* vec, const int8_t* weights,
size_t wstep, const int* bias, const float* multiplier,
int* dst, int nvecs, int vecsize, int outZp );
}}}
#else
# define CV_RVP052 0
#endif

View File

@ -338,7 +338,7 @@ int runWinograd63(InputArray _input, InputArray _fusedAddMat, OutputArray _outpu
}
#if CV_TRY_AVX2
if (conv->useAVX2)
opt_AVX::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
opt_AVX2::winofunc_AtXA_8x8_F32((float *)out_wbuf + ((k - k0)*CONV_WINO_IBLOCK + (block_id - block_id0))*CONV_WINO_AREA, CONV_WINO_SIZE,
bpptr, outstep, outptr, outstep, biasv, minval, maxval, ifMinMaxAct);
else
#endif

View File

@ -385,7 +385,7 @@ void fastGemmBatch(bool trans_a, bool trans_b,
const auto shape_b = shape(B);
const auto shape_c = shape(C);
CV_CheckGE(shape_a.size(), static_cast<size_t>(2), "DNN/fastGemmBatch: A must be n-dimensional (n >= 2)");
CV_CheckEQ(shape_b.size(), static_cast<size_t>(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)");
CV_CheckGE(shape_b.size(), static_cast<size_t>(2), "DNN/fastGemmBatch: B must be n-dimensional (n >= 2)");
const float *a = A.ptr<const float>();
const float *b = B.ptr<const float>();

View File

@ -158,4 +158,51 @@ void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &o
parallel_for_(Range(0, loops), fn, nstripes);
}
// Group-wise Normalization speedup by multi-threading.
// Splits the C channels of input (N, C, ...) into num_groups groups, normalizes
// each (batch, group) slice to zero mean / unit variance (stabilized by
// epsilon), then applies the per-channel scale and bias.
// Preconditions: scale and bias each hold C elements; input.dims >= 3;
// C must be divisible by num_groups.
void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups) {
    const auto input_shape = shape(input);
    size_t N = input_shape[0], C = input_shape[1];
    CV_CheckEQ(scale.total(), bias.total(), "fastNormGroup: scale and bias should have the same shape");
    CV_CheckEQ(scale.total(), C, "fastNormGroup: scale should be a 1d tensor and match the channel of input");
    CV_CheckGE(input.dims, 3, "fastNormGroup: input dimension >= 3");

    size_t channels_per_group = C / num_groups;
    size_t loops = N * num_groups;                // one normalization slice per (batch, group)
    size_t norm_size = static_cast<size_t>(total(input_shape, 2) * channels_per_group);
    size_t step = norm_size / channels_per_group; // spatial size of one channel
    float inv_norm_size = 1.0 / norm_size;

    auto fn = [&](const Range &r) {
        const auto *input_data = input.ptr<const float>();
        const auto *scale_data = scale.ptr<const float>();
        const auto *bias_data = bias.ptr<const float>();
        auto *output_data = output.ptr<float>();
        for (int i = r.start; i < r.end; i++) {
            const auto *x = input_data + norm_size * i;
            auto *y = output_data + norm_size * i;

            // Pass 1: accumulate sum and sum of squares over the slice.
            // (fix: loop index is size_t, matching the size_t bound norm_size
            // and the second pass below — the original used int here, a
            // signed/unsigned mismatch.)
            float mean = 0.f, mean_square = 0.f;
            for (size_t j = 0; j < norm_size; j++) {
                float v = x[j];
                mean += v;
                mean_square += v * v;
            }

            // mean = E[x]; mean_square becomes sqrt(max(0, Var[x]) + epsilon).
            mean *= inv_norm_size;
            mean_square = std::sqrt(std::max(0.f, mean_square * inv_norm_size - mean * mean) + epsilon);
            float inv_stdev = 1.f / mean_square;

            // Pass 2: normalize and apply the per-channel affine transform.
            size_t group_idx = i % num_groups * channels_per_group; // first channel of this group
            for (size_t j = 0; j < norm_size; j++) {
                size_t c = group_idx + (j / step); // channel owning element j
                float s = scale_data[c] * inv_stdev, b = bias_data[c];
                y[j] = s * (x[j] - mean) + b;
            }
        }
    };
    double nstripes = loops * norm_size * (1 / 1024.0);
    parallel_for_(Range(0, loops), fn, nstripes);
}
}} // cv::dnn

View File

@ -21,6 +21,9 @@ void fastNorm(const Mat &input, const Mat &scale, const Mat &bias, Mat &output,
// Channel-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
void fastNormChannel(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon);
// Group-wise Normalization speedup by multi-threading. Scale and bias should have the same shape (C). Input should have dimension >= 3.
void fastNormGroup(const Mat &input, const Mat &scale, const Mat &bias, Mat &output, float epsilon, size_t num_groups);
}} // cv::dnn
#endif // OPENCV_DNN_FAST_NORM_HPP

View File

@ -1299,7 +1299,6 @@ Mat LayerEinsumImpl::batchwiseMatMul(
const Mat& input2,
const MatShape& input2ShapeOverride)
{
// Sanity checks before the actual MatMul
CV_CheckType(input1.type(), input2.type(), "Data types of the inputs must match for MatMul");
CV_CheckEQ(input1ShapeOverride.size(), (size_t) 3, "Only 1 batch dimension is allowed for MatMul");
@ -1312,59 +1311,21 @@ Mat LayerEinsumImpl::batchwiseMatMul(
int K = input1ShapeOverride[2];
int N = input2ShapeOverride[2];
std::vector<Mat> output;
Mat reshapedInput1 = input1;
Mat reshapedInput2 = input2;
Mat output;
if (batches > 1)
{
Mat reshapedInput1 = input1;
Mat reshapedInput2 = input2;
// create tmpout with type like input1
output = Mat({batches, M, N}, input1.type());
// input1 should of size MxK
// check if input1 needs reshape, if need reshape
if (input1.size[0] != M || input1.size[1] != K)
{
int shape[] = {batches, M, K};
reshapedInput1 = input1.reshape(1, 3, shape);
}
// input2 should be of size KxN
// check if input2 needs reshape, if needs reshape
if (input2.size[0] != K || input2.size[1] != N)
{
int shape[] = {batches, K, N};
reshapedInput2 = input2.reshape(1, 3, shape);
}
for (size_t i=0; i < batches; i++)
{
std::vector<Range> ranges1 = {cv::Range(i, i+1)};
for (int j = 1; j < reshapedInput1.dims; j++)
ranges1.emplace_back(cv::Range::all());
Mat part1 = reshapedInput1(ranges1);
int shape[] = {M, K};
part1 = part1.reshape(1, sizeof(shape)/sizeof(shape[0]), shape);
std::vector<Range> ranges2 = {cv::Range(i, i+1)};
for (int j = 1; j < reshapedInput2.dims; j++)
ranges2.emplace_back(cv::Range::all());
Mat part2 = reshapedInput2(ranges2);
int shape2[] = {K, N};
part2 = part2.reshape(1, sizeof(shape2)/sizeof(shape2[0]), shape2);
Mat tmp_output(M, N, part1.type());
fastGemm(false, false, 1.0, part1, part2, 0.0, tmp_output, opt);
int newShape[] = {1, M, N};
tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
output.emplace_back(tmp_output);
}
reshapedInput2 = reshapedInput2.reshape(1, input2ShapeOverride);
reshapedInput1 = reshapedInput1.reshape(1, input1ShapeOverride);
fastGemmBatch(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt);
} else {
Mat reshapedInput1 = input1;
Mat reshapedInput2 = input2;
// input1 should of size MxK
// check if input1 needs reshape, if need reshape
if (input1.dims > 2 || input1.size[0] != M || (input1.dims > 1 && input1.size[1] != K) || input1.dims == 1)
@ -1381,23 +1342,12 @@ Mat LayerEinsumImpl::batchwiseMatMul(
reshapedInput2 = input2.reshape(1, 2, shape2);
}
Mat tmp_output(M, N, reshapedInput1.type());
fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, tmp_output, opt);
int newShape[] = {1, M, N};
tmp_output = tmp_output.reshape(1, sizeof(newShape)/sizeof(newShape[0]), newShape);
output.emplace_back(tmp_output);
output = Mat(M, N, reshapedInput1.type());
fastGemm(false, false, 1.0, reshapedInput1, reshapedInput2, 0.0, output, opt);
output = output.reshape(1, {1, M, N});
}
int outputDim[] = {static_cast<int>(output.size()), M, N};
Mat output_buffer = Mat::zeros(3, outputDim, CV_32F);
for (size_t i = 0; i < output.size(); i++) {
Mat output_slice = output_buffer.row(i);
output[i].copyTo(output_slice);
}
return output_buffer;
return output;
};
Ptr<EinsumLayer> EinsumLayer::create(const LayerParams& params)
{

View File

@ -453,13 +453,6 @@ public:
ret = false;
break;
}
if (!use_half && bias && (outerSize > 1))
{
UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
UMat& biases = umat_blobs[1];
cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
}
}
if (ret) return true;

View File

@ -0,0 +1,190 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#include "../precomp.hpp"
#include <opencv2/dnn/shape_utils.hpp>
#include "./cpu_kernels/fast_norm.hpp"
// CUDA backend
#include "../op_cuda.hpp"
#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/group_norm.hpp"
using namespace cv::dnn::cuda4dnn;
#endif
// OpenCL backend
#ifdef HAVE_OPENCL
#include "../ocl4dnn/include/math_functions.hpp"
#include "opencl_kernels_dnn.hpp"
#endif
namespace cv {
namespace dnn {
// https://github.com/onnx/onnx/blob/main/docs/Operators.md#GroupNormalization
// https://github.com/onnx/onnx/blob/main/docs/Operators.md#GroupNormalization
// Splits the C channels into num_groups groups, normalizes each group to zero
// mean / unit variance (stabilized by epsilon), then applies a per-channel
// scale and bias. Inputs: [0] data (N, C, ...), [1] scale (C), [2] bias (C).
class GroupNormLayerImpl CV_FINAL : public GroupNormLayer {
public:
    GroupNormLayerImpl(const LayerParams &params) {
        setParamsFrom(params);
        // epsilon defaults to the ONNX default; num_groups is mandatory.
        epsilon = params.get<float>("epsilon", 1e-5);
        num_groups = params.get<int>("num_groups");
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA;
    }

    // Output shape equals the input shape; also validates that scale and bias
    // each flatten to C elements.
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE {
        const auto &input = inputs[0];
        const auto &scale = inputs[1];
        const auto &bias = inputs[2];
        CV_CheckGE(input.size(), static_cast<size_t>(3), "DNN/GroupNorm: input dimension >= 3 is required");

        int C = input[1];
        // fix: the error messages below said "DNN/InstanceNorm" — a copy-paste
        // slip from the InstanceNorm layer; this is the GroupNorm layer.
        int scale_dim = std::accumulate(scale.begin(), scale.end(), 1, std::multiplies<int>());
        CV_CheckEQ(scale_dim, C, "DNN/GroupNorm: scale must be a 1d tensor and match the channel of input");
        int bias_dim = std::accumulate(bias.begin(), bias.end(), 1, std::multiplies<int>());
        CV_CheckEQ(bias_dim, C, "DNN/GroupNorm: bias must be a 1d tensor and match the channel of input");

        outputs.assign(1, inputs[0]);
        return false;
    }

    // CPU forward path; fp16 inputs go through the generic fallback.
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        if (inputs_arr.depth() == CV_16S) {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        const auto& input = inputs[0];
        const auto& scale = inputs[1];
        const auto& bias = inputs[2];
        fastNormGroup(input, scale, bias, outputs[0], epsilon, num_groups);
    }

#ifdef HAVE_OPENCL
    // OpenCL forward path: computes per-(batch, group) mean and mean-of-squares
    // with two GEMV reductions, then runs the group-wise MVN kernel.
    // NOTE(review): forward() above does not dispatch to this method (no
    // CV_OCL_RUN); confirm whether the OpenCL path is intentionally unwired.
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        const auto &input = inputs[0], &scale = inputs[1], &bias = inputs[2];
        auto &output = outputs[0];

        const auto input_shape = shape(input);
        size_t N = input_shape[0], C = input_shape[1];
        size_t num_groups = this->num_groups;
        size_t channels_per_group = C / num_groups;
        size_t loops = N * num_groups, norm_size = static_cast<size_t>(total(input_shape, 2)) * channels_per_group;
        float inv_norm_size = 1.f / norm_size;

        // no fp16 support
        if (input.depth() == CV_16S) {
            return false;
        }

        String base_opts = format(" -DT=float -DT4=float4 -Dconvert_T=convert_float4");

        // Calculate mean: GEMV against a ones-vector averages each slice.
        UMat one = UMat::ones(norm_size, 1, CV_32F);
        UMat mean = UMat(loops, 1, CV_32F);
        UMat mean_square = UMat(loops, 1, CV_32F);
        UMat tmp = UMat(loops, norm_size, CV_32F);
        bool ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
                                               input, 0, one, 0, 0.f, mean, 0);
        if (!ret) {
            return false;
        }
        // Calculate mean_square: square the centered values, then average.
        int num_vector = (norm_size % 8 == 0) ? 8 : ((norm_size % 4 == 0) ? 4 : 1);
        size_t global[] = {loops, static_cast<size_t>(norm_size / num_vector)};
        String build_opt = format(" -DNUM=%d", num_vector) + base_opts;
        String mean_square_kernel_name = format("calc_mean%d", num_vector);
        ocl::Kernel mean_square_kernel(mean_square_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt + " -DKERNEL_MEAN");
        if (mean_square_kernel.empty()) {
            return false;
        }
        mean_square_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
        mean_square_kernel.set(1, (int)loops);
        mean_square_kernel.set(2, (int)norm_size);
        mean_square_kernel.set(3, ocl::KernelArg::PtrReadOnly(mean));
        mean_square_kernel.set(4, ocl::KernelArg::PtrWriteOnly(tmp));
        ret = mean_square_kernel.run(2, global, NULL, false);
        if (!ret) {
            return false;
        }
        ret = ocl4dnn::ocl4dnnGEMV<float>(ocl4dnn::CblasNoTrans, loops, norm_size, inv_norm_size,
                                          tmp, 0, one, 0, 0.f, mean_square, 0);
        if (!ret) {
            return false;
        }
        // Calculate group norm: output = scale * (x - mean) / sqrt(var + eps) + bias
        String mvn_group_kernel_name = format("mvn_group%d", num_vector);
        build_opt += " -DNORM_VARIANCE -DKERNEL_MVN_GROUP";
        ocl::Kernel mvn_group_kernel(mvn_group_kernel_name.c_str(), ocl::dnn::mvn_oclsrc, build_opt);
        if (mvn_group_kernel.empty()) {
            return false;
        }
        mvn_group_kernel.set(0, ocl::KernelArg::PtrReadOnly(input));
        mvn_group_kernel.set(1, (int)loops);
        mvn_group_kernel.set(2, (int)norm_size);
        mvn_group_kernel.set(3, (float)epsilon);
        mvn_group_kernel.set(4, ocl::KernelArg::PtrReadOnly(mean));
        mvn_group_kernel.set(5, ocl::KernelArg::PtrReadOnly(mean_square));
        mvn_group_kernel.set(6, ocl::KernelArg::PtrReadOnly(scale));
        mvn_group_kernel.set(7, ocl::KernelArg::PtrReadOnly(bias));
        mvn_group_kernel.set(8, (int)C);
        mvn_group_kernel.set(9, (int)num_groups);
        mvn_group_kernel.set(10, (float)0.f);
        mvn_group_kernel.set(11, ocl::KernelArg::PtrWriteOnly(output));
        ret = mvn_group_kernel.run(2, global, NULL, false);
        if (!ret) {
            return false;
        }
        return true;
    }
#endif

#ifdef HAVE_CUDA
    // CUDA backend: builds the GroupNormOp primitive; loops = N * num_groups
    // sizes the per-(batch, group) mean/stdev workspace.
    Ptr<BackendNode> initCUDA(void *context_,
                              const std::vector<Ptr<BackendWrapper>>& inputs,
                              const std::vector<Ptr<BackendWrapper>>& outputs) override {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto input_shape = input_wrapper->getShape();
        size_t N = input_shape[0];
        size_t num_groups = this->num_groups;
        size_t loops = N * num_groups;

        return make_cuda_node<cuda4dnn::GroupNormOp>(preferableTarget, std::move(context->stream), epsilon, loops, num_groups);
    }
#endif // HAVE_CUDA

private:
    float epsilon;     // variance-stabilizing term
    size_t num_groups; // number of channel groups
};
// Factory entry point: builds a GroupNormLayerImpl configured from params
// and hands ownership to the returned smart pointer.
Ptr<GroupNormLayer> GroupNormLayer::create(const LayerParams &params) {
    Ptr<GroupNormLayer> layer(new GroupNormLayerImpl(params));
    return layer;
}
}} // cv::dnn

View File

@ -24,6 +24,16 @@ namespace cv
namespace dnn
{
namespace {
// Modulo whose result takes the sign of the divisor y (floor-mod semantics,
// as in ONNX Mod with fmod=0), unlike C++ operator% which truncates toward
// zero and keeps the dividend's sign.
static int _mod(int x, int y) {
    int remainder = x % y;
    bool sign_mismatch = (remainder != 0) && ((remainder < 0) != (y < 0));
    return sign_mismatch ? remainder + y : remainder;
}
}
class NaryEltwiseLayerImpl CV_FINAL : public NaryEltwiseLayer
{
public:
@ -42,7 +52,8 @@ public:
MAX,
MEAN,
MIN,
MOD,
MOD, // Integer Mod. Reminder's sign = Divisor's sign.
FMOD, // Floating-point Mod. Reminder's sign = Dividend's sign.
PROD,
SUB,
SUM,
@ -79,6 +90,8 @@ public:
op = OPERATION::MIN;
else if (operation == "mod")
op = OPERATION::MOD;
else if (operation == "fmod")
op = OPERATION::FMOD;
else if (operation == "mul")
op = OPERATION::PROD;
else if (operation == "sub")
@ -106,18 +119,21 @@ public:
#ifdef HAVE_CANN
if (backendId == DNN_BACKEND_CANN)
return op == OPERATION::ADD || op == OPERATION::PROD || op == OPERATION::SUB ||
op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN;
op == OPERATION::DIV || op == OPERATION::MAX || op == OPERATION::MIN ||
op == OPERATION::MOD || op == OPERATION::FMOD;
#endif
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return (op == OPERATION::ADD ||
op == OPERATION::PROD ||
op == OPERATION::GREATER_EQUAL ||
op == OPERATION::LESS_EQUAL
op == OPERATION::LESS_EQUAL ||
op == OPERATION::MOD ||
op == OPERATION::FMOD
);
if (backendId == DNN_BACKEND_CUDA) {
return op == OPERATION::MAX || op == OPERATION::MIN || op == OPERATION::SUM ||
op == OPERATION::PROD || op == OPERATION::DIV || op == OPERATION::ADD ||
op == OPERATION::SUB;
op == OPERATION::SUB || op == OPERATION::MOD || op == OPERATION::FMOD;
}
return backendId == DNN_BACKEND_OPENCV;
}
@ -707,10 +723,16 @@ public:
}
case OPERATION::MOD:
{
auto mod = [](const uint8_t &a, const uint8_t &b) { return a % b; };
auto mod = [] (const T &a, const T &b) { return static_cast<T>(_mod(int(a), int(b))); };
binary_forward<T>(mod, std::forward<Args>(args)...);
break;
}
case OPERATION::FMOD:
{
auto fmod = [](const T &a, const T &b) { return std::fmod(a, b); };
binary_forward<T>(fmod, std::forward<Args>(args)...);
break;
}
case OPERATION::PROD:
{
auto prod = [](const T &a, const T &b) { return a * b; };
@ -782,9 +804,8 @@ public:
opDispatch<int32_t>(std::forward<Args>(args)...);
break;
case CV_32F:
CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::MOD &&
op != OPERATION::AND && op != OPERATION::OR &&
op != OPERATION::XOR);
CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::AND &&
op != OPERATION::OR && op != OPERATION::XOR);
opDispatch<float>(std::forward<Args>(args)...);
break;
default:
@ -801,19 +822,6 @@ public:
{
auto context = reinterpret_cast<csl::CSLContext*>(context_);
auto input_0_shape = inputs[0].dynamicCast<CUDABackendWrapper>()->getShape();
for (int i = 1; i < inputs.size(); i++)
{
auto input_i_shape = inputs[i].dynamicCast<CUDABackendWrapper>()->getShape();
if (input_0_shape.size() != input_i_shape.size())
return Ptr<BackendNode>();
// check if the shape can be supported by `eltwise_ops.cu`, or return the default BackendNode
for (int j = 0; j < input_0_shape.size(); j++)
if (input_0_shape[j] != input_i_shape[j] &&
input_0_shape[j] != 1 && input_i_shape[j] != 1)
return Ptr<BackendNode>();
}
cuda4dnn::EltwiseOpType op_ = cuda4dnn::EltwiseOpType::SUM;
switch (op) {
case OPERATION::MAX:
@ -837,6 +845,12 @@ public:
case OPERATION::SUB:
op_ = cuda4dnn::EltwiseOpType::SUB;
break;
case OPERATION::MOD:
op_ = cuda4dnn::EltwiseOpType::MOD;
break;
case OPERATION::FMOD:
op_ = cuda4dnn::EltwiseOpType::FMOD;
break;
default: return Ptr<BackendNode>(); // return empty cuda_node if the EltwiseOpType is unsupported type.
};
@ -881,6 +895,8 @@ public:
BUILD_CANN_ELTWISE_OP(OPERATION::DIV, Xdivy, name);
BUILD_CANN_ELTWISE_OP(OPERATION::MAX, Maximum, name);
BUILD_CANN_ELTWISE_OP(OPERATION::MIN, Minimum, name);
BUILD_CANN_ELTWISE_OP(OPERATION::MOD, Mod, name);
BUILD_CANN_ELTWISE_OP(OPERATION::FMOD, Mod, name);
#undef BUILD_CANN_ELTWISE_OP
default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
}
@ -927,6 +943,16 @@ public:
node = std::make_shared<ngraph::op::v1::GreaterEqual>(inp0, inp1);
else if (op == OPERATION::LESS_EQUAL)
node = std::make_shared<ngraph::op::v1::LessEqual>(inp0, inp1);
// Ideally we should do this but int32 internal blobs are converted to float32 data type in inference.
// TODO: Remove data type convertion when we have type inference.
else if (op == OPERATION::MOD) {
auto inp0_i64 = std::make_shared<ngraph::op::Convert>(inp0, ngraph::element::i64);
auto inp1_i64 = std::make_shared<ngraph::op::Convert>(inp1, ngraph::element::i64);
auto mod = std::make_shared<ngraph::op::v1::FloorMod>(inp0_i64, inp1_i64);
node = std::make_shared<ngraph::op::Convert>(mod, ngraph::element::f32);
}
else if (op == OPERATION::FMOD)
node = std::make_shared<ngraph::op::v1::Mod>(inp0, inp1);
else
CV_Error(Error::StsNotImplemented, "Operation is not implemented for nGraph backend");
return Ptr<BackendNode>(new InfEngineNgraphNode(node));

View File

@ -74,6 +74,11 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S) {
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
@ -89,49 +94,59 @@ public:
// NOTE: This impl does not check whether indices have duplicate entries.
// The last duplicate entry will overwrite the previous.
template<typename T, typename Functor>
void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
{
data.copyTo(out);
void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) {
input_mat.copyTo(output_mat);
const int* shape = data.size.p;
const size_t* step = data.step.p;
const auto &input_mat_shape = shape(input_mat);
std::vector<size_t> input_mat_step(input_mat_shape.size());
for (int i = 0; i < input_mat.dims; i++) {
input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
}
const int ind_ndims = indices.dims;
const int* ind_shape = indices.size.p;
const T* p_indices = indices.ptr<const T>();
const int indices_mat_ndims = indices_mat.dims;
const auto &indices_mat_shape = shape(indices_mat);
const int upd_ndims = updates.dims;
const int* upd_shape = updates.size.p;
const T* p_updates = updates.ptr<const T>();
const int updates_mat_ndims = updates_mat.dims;
const auto &updates_mat_shape = shape(updates_mat);
T* p_out = out.ptr<T>();
int k = ind_shape[ind_ndims - 1]; // last dim of indices
size_t total = (size_t)(indices.total() / k);
int indices_last_dim = indices_mat_shape[indices_mat_ndims - 1]; // last dim of indices
size_t updates_size = 1;
for (int i = ind_ndims - 1; i < upd_ndims; i++)
updates_size *= upd_shape[i];
for (int i = indices_mat_ndims - 1; i < updates_mat_ndims; i++)
updates_size *= updates_mat_shape[i];
size_t inp_start_offset = 0;
size_t ind_start_offset = 0;
size_t upd_start_offset = 0;
for (size_t i = 0; i < total; i++, ind_start_offset += k, upd_start_offset += updates_size)
{
const T* tmp_p_indices = p_indices + ind_start_offset;
inp_start_offset = 0;
for (int j = 0; j < k; j++)
{
CV_Assert(tmp_p_indices[j] < shape[j] && tmp_p_indices[j] > -shape[j]);
inp_start_offset += (((int)tmp_p_indices[j] + shape[j]) % shape[j]) * step[j];
}
inp_start_offset /= sizeof(T);
auto fn = [&](const Range &r) {
size_t input_offset = 0,
indices_offset = r.start * indices_last_dim,
updates_offset = r.start * updates_size;
for (int i = r.start; i < r.end; i++) {
const T* indices = indices_mat.ptr<const T>();
const T* updates = updates_mat.ptr<const T>();
T* output = output_mat.ptr<T>();
const T* tmp_p_updates = p_updates + upd_start_offset;
T* tmp_p_out = p_out + inp_start_offset;
for (int j = 0; j < updates_size; j++)
tmp_p_out[j] = rd(tmp_p_out[j], tmp_p_updates[j]);
input_offset = 0;
indices += indices_offset;
for (int j = 0; j < indices_last_dim; j++) {
int index = static_cast<int>(*(indices + j));
index = (index + input_mat_shape[j]) % input_mat_shape[j];
CV_Assert(index < input_mat_shape[j] && index >= 0);
input_offset += index * input_mat_step[j];
}
updates += updates_offset;
output += input_offset;
for (int j = 0; j < updates_size; j++) {
output[j] = reduce_operation(output[j], updates[j]);
}
indices_offset += indices_last_dim;
updates_offset += updates_size;
}
};
size_t total = (size_t)(indices_mat.total() / indices_last_dim);
double nstripes = (size_t)total * (indices_last_dim + updates_size) * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
template<typename... Args>

View File

@ -68,6 +68,11 @@ public:
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
if (inputs_arr.depth() == CV_16S) {
forward_fallback(inputs_arr, outputs_arr, internals_arr);
return;
}
std::vector<Mat> inputs, outputs;
inputs_arr.getMatVector(inputs);
outputs_arr.getMatVector(outputs);
@ -81,59 +86,62 @@ public:
}
template<typename T, typename Functor>
void forward_impl(const Functor& rd, const Mat& data, const Mat& indices, const Mat& updates, Mat& out)
{
data.copyTo(out);
void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) {
input_mat.copyTo(output_mat);
const int ndims = data.dims;
const int* shape = data.size.p;
const size_t* step = data.step.p;
const int ndims = input_mat.dims;
const int* ind_shape = indices.size.p;
const size_t* ind_step = indices.step.p;
const auto &input_mat_shape = shape(input_mat);
std::vector<size_t> input_mat_step(ndims);
size_t inp_offset = 0;
size_t ind_offset = 0;
const T* p_index = indices.ptr<const T>();
const T* p_update = updates.ptr<const T>();
T* p_out = out.ptr<T>();
const auto &indices_mat_shape = shape(indices_mat);
std::vector<size_t> indices_mat_step(ndims);
size_t total = indices.total();
for (int i = 0; i < ndims; i++) {
input_mat_step[i] = static_cast<size_t>(input_mat.step.p[i] / sizeof(T));
indices_mat_step[i] = static_cast<size_t>(indices_mat.step.p[i] / sizeof(T));
}
int j, offset_at_idx, index;
size_t t, idx;
for (size_t i = 0; i < total; i++)
{
t = i;
inp_offset = 0;
ind_offset = 0;
int offset_at_axis = 0;
for (j = ndims - 1; j >= 0; j--)
{
idx = t / ind_shape[j];
offset_at_idx = (int)(t - idx * ind_shape[j]);
ind_offset += offset_at_idx * ind_step[j];
inp_offset += offset_at_idx * step[j];
t = idx;
if (j == axis)
{
offset_at_axis = offset_at_idx * step[j];
auto fn = [&](const Range &r) {
size_t input_offset = 0, indices_offset = 0;
int indices_index, index;
size_t axis_offset, tmp_index, j_index;
for (int i = r.start; i < r.end; i++) {
const T* indices = indices_mat.ptr<const T>();
const T* updates = updates_mat.ptr<const T>();
T* output = output_mat.ptr<T>();
input_offset = 0;
indices_offset = 0;
indices_index = i;
axis_offset = 0;
for (int j = ndims - 1; j >= 0; j--) {
tmp_index = indices_index / indices_mat_shape[j];
j_index = (size_t)(indices_index - tmp_index * indices_mat_shape[j]);
input_offset += j_index * input_mat_step[j];
indices_offset += j_index * indices_mat_step[j];
indices_index = tmp_index;
if (j == axis) {
axis_offset = j_index * input_mat_step[j];
}
}
ind_offset /= sizeof(T);
// get index and overwrite current indices
const T* tmp_p_index = p_index + ind_offset;
index = (int)(*tmp_p_index);
CV_Assert(index < shape[axis] && index > -shape[axis]);
index = static_cast<int>(*(indices + indices_offset));
index = (index + input_mat_shape[axis]) % input_mat_shape[axis];
CV_Assert(index < input_mat_shape[axis] && index >= 0);
input_offset = input_offset - axis_offset + index * input_mat_step[axis];
inp_offset = inp_offset - offset_at_axis + ((index + shape[axis]) % shape[axis]) * step[axis];
inp_offset /= sizeof(T);
const T* tmp_p_update = p_update + ind_offset;
T* tmp_p_out = p_out + inp_offset;
*tmp_p_out = rd(*tmp_p_out, *tmp_p_update);
updates += indices_offset;
output += input_offset;
*output = reduce_operation(*output, *updates);
}
};
size_t total = indices_mat.total();
double nstripes = (size_t)total * ndims * (1 / 1024.0);
parallel_for_(Range(0, total), fn, nstripes);
}
template<typename... Args>

View File

@ -901,7 +901,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName)
CV_Assert(!empty());
FPDenormalsIgnoreHintScope fp_denormals_ignore_scope;
#ifdef CV_CXX11
String layerName = outputName;
if (layerName.empty())
@ -922,9 +921,6 @@ AsyncArray Net::Impl::forwardAsync(const String& outputName)
isAsync = false;
return getBlobAsync(layerName);
#else
CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward requires build with enabled C++11");
#endif // CV_CXX11
}

View File

@ -265,11 +265,9 @@ struct Net::Impl : public detail::NetImplBase
Mat getBlob(String outputName) const;
#ifdef CV_CXX11
virtual AsyncArray getBlobAsync(const LayerPin& pin);
AsyncArray getBlobAsync(String outputName);
#endif // CV_CXX11
string dump(bool forceAllocation = false) const;

View File

@ -728,6 +728,10 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
break;
#ifdef HAVE_CUDA
/* Risk: Not every operation in "NaryEltwise" is supported in the CUDA backend. There is a chance
that Concat's output is filled with data in both host and device, leading to data missing.
See https://github.com/opencv/opencv/issues/24721 for more details.
*/
if (preferableBackend == DNN_BACKEND_CUDA &&
(inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
(inp_i_data->layerInstance->type != "Convolution" &&

View File

@ -97,8 +97,8 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
max_image_size);
}
if (use_half_ && bias_term_)
{
if (bias_term_) {
if (use_half_) {
UMat biasOneMat = UMat::ones(M_, 1, CV_32F);
UMat newbias, tmpTop;
@ -106,6 +106,10 @@ bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
convertFp16(top, tmpTop);
cv::gemm(biasOneMat, newbias, 1, tmpTop, 1, tmpTop, 0);
convertFp16(tmpTop, top);
} else {
UMat biasOnesMat = UMat::ones(M_, 1, CV_32F);
cv::gemm(biasOnesMat, bias, 1, top, 1, top, 0);
}
}
return ret;

View File

@ -86,6 +86,7 @@ public:
int getTensorShapeSize(int node_id, int node_input_id) {
const auto node = getNode(node_id);
const auto &input_name = node->getInputName(node_input_id);
// try to get from value_info
for (int i = 0; i < net.value_info_size(); i++) {
const auto value_info = net.value_info(i);
if (value_info.name() == input_name) {
@ -97,6 +98,18 @@ public:
}
}
}
// try to get from input
for (int i = 0; i < net.input_size(); i++) {
const auto input = net.input(i);
if (input.name() == input_name) {
if (input.has_type() && input.type().has_tensor_type() &&
input.type().tensor_type().has_shape()) {
return input.type().tensor_type().shape().dim_size();
} else {
return -1;
}
}
}
return -1;
}
@ -660,6 +673,10 @@ private:
[Input] -> LayerNorm -> [Output]
\
[weight], [bias]
Note: axes of ReduceMean must be:
- last element is the axis of last dimension (-1 or (input_ndims - 1))
- a list of adjacent axes, e.g. [1, 2, 3, ..., input_ndims - 1]
*/
class LayerNormSubGraph : public Subgraph
{
@ -683,19 +700,22 @@ public:
setFusedNode("LayerNormalization", input);
}
static float extractAxis(const Ptr<ImportGraphWrapper>& net, int node_id)
static std::vector<int64_t> extractAxis(const Ptr<ImportGraphWrapper>& net, int node_id)
{
// TODO: consider ReduceMean-18 which has axes as one of the inputs instead of attributes
Ptr<ImportNodeWrapper> mean_ptr = net->getNode(node_id);
opencv_onnx::NodeProto* mean_node = mean_ptr.dynamicCast<ONNXNodeWrapper>()->node;
int axis_ = -1;
std::vector<int64_t> axes;
for (int i = 0; i < mean_node->attribute_size(); i++)
{
opencv_onnx::AttributeProto attr = mean_node->attribute(i);
if (attr.name() != "axes")
continue;
axis_ = static_cast<int>(attr.ints(0));
for (int j = 0; j < attr.ints_size(); j++) {
axes.push_back(attr.ints(j));
}
return axis_;
}
return axes;
}
virtual bool match(const Ptr<ImportGraphWrapper>& net, int nodeId,
@ -707,11 +727,31 @@ public:
if (pow_exp - 2 > 1e-5) // not pow(2)
return false;
int axis_mean1 = extractAxis(net, matchedNodesIds[mean]);
int axis_mean2 = extractAxis(net, matchedNodesIds[mean1]);
if (axis_mean1 != axis_mean2)
std::vector<int64_t> axes = extractAxis(net, matchedNodesIds[mean]);
// check whether it is -1 or last_axis or [axis, ..., last_axis]
int64_t input_ndims = static_cast<int64_t>(net.dynamicCast<ONNXGraphWrapper>()->getTensorShapeSize(matchedNodesIds[mean], 0));
if (input_ndims == -1) {
return false; // input shape unknown
}
// assume that axes are sorted in ascending order, e.g. [0, 1, 2, 3] or [-3, -2, -1]
if (axes.back() != -1 && axes.back() != (input_ndims - 1)) {
return false;
axis = axis_mean1;
}
for (size_t i = 0; i < axes.size() - 1; i++) {
if (axes[i] - axes[i + 1] != -1) {
return false;
}
}
std::vector<int64_t> axes1 = extractAxis(net, matchedNodesIds[mean1]);
if (axes.size() != axes1.size())
return false;
for (size_t i = 0; i < axes.size(); i++) {
if (((axes[i] + input_ndims) % input_ndims) != ((axes1[i] + input_ndims) % input_ndims)) {
return false;
}
}
axis = axes[0];
epsilon = extractConstant(net, matchedNodesIds[add], 1).at<float>(0);

View File

@ -22,6 +22,7 @@
#ifdef HAVE_PROTOBUF
#include <array>
#include <iostream>
#include <fstream>
#include <string>
@ -2619,6 +2620,7 @@ void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::Node
// Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1
int axis = layerParams.get<int>("axis", 1);
axis = normalize_axis(axis, inputShape.size());
for (size_t i = 0; i < inputs.size(); ++i)
{
inputShape[axis] = inputs[i].dims == (int)inputShape.size() ? inputs[i].size[axis] : 1;
@ -2831,6 +2833,11 @@ void ONNXImporter::parseElementWise(LayerParams& layerParams, const opencv_onnx:
layerParams.type = "NaryEltwise";
layerParams.set("operation", toLowerCase(node_proto.op_type()));
if (node_proto.op_type() == "Mod") {
if (layerParams.get<int>("fmod", 0)) {
layerParams.set("operation", "fmod");
};
}
// element-wise layers that can have >=1 inputs but actually have one input
if (node_proto.input_size() == 1 && (op_type == "max" || op_type == "min" || op_type == "mean" || op_type == "sum"))
@ -4004,10 +4011,11 @@ void ONNXImporter::buildDispatchMap_ONNX_AI(int opset_version)
dispatch["ScatterElements"] = dispatch["Scatter"] = dispatch["ScatterND"] = &ONNXImporter::parseScatter;
dispatch["Tile"] = &ONNXImporter::parseTile;
dispatch["LayerNormalization"] = &ONNXImporter::parseLayerNorm;
dispatch["GroupNormalization"] = &ONNXImporter::parseInstanceNormalization;
dispatch["Equal"] = dispatch["Greater"] = dispatch["Less"] = dispatch["Pow"] = dispatch["Add"] =
dispatch["Sub"] = dispatch["Mul"] = dispatch["Div"] = dispatch["GreaterOrEqual"] =
dispatch["LessOrEqual"] = &ONNXImporter::parseElementWise;
dispatch["LessOrEqual"] = dispatch["Mod"] = &ONNXImporter::parseElementWise;
dispatch["Sum"] = dispatch["Min"] = dispatch["Max"] = &ONNXImporter::parseElementWise;
dispatch["Where"] = &ONNXImporter::parseElementWise;

View File

@ -54,6 +54,7 @@
#define vec_type Dtype8
#define CALC_MEAN calc_mean8
#define MVN mvn8
#define MVN_GROUP mvn_group8
#define MEAN_FUSE mean_fuse8
#define MVN_FUSE mvn_fuse8
#elif NUM == 4
@ -62,6 +63,7 @@
#define vec_type Dtype4
#define CALC_MEAN calc_mean4
#define MVN mvn4
#define MVN_GROUP mvn_group4
#define MEAN_FUSE mean_fuse4
#define MVN_FUSE mvn_fuse4
#elif NUM == 1
@ -70,6 +72,7 @@
#define vec_type Dtype
#define CALC_MEAN calc_mean1
#define MVN mvn1
#define MVN_GROUP mvn_group1
#define MEAN_FUSE mean_fuse1
#define MVN_FUSE mvn_fuse1
#endif
@ -150,6 +153,54 @@ __kernel void MVN(__global const Dtype* src,
store(dst_vec, dst, index);
}
#elif defined KERNEL_MVN_GROUP
/*
 * Group-normalization kernel: each row x of `src` holds one (sample, group)
 * slice that was normalized with the precomputed per-row statistics
 * mean[x] / dev[x]; the result is then scaled/shifted with per-channel
 * weight/bias and optionally passed through a fused leaky-ReLU.
 *
 * NOTE(review): `norm_size` is not a kernel argument — it is presumably
 * injected as a compile-time define (-Dnorm_size=...) by the host code,
 * like NUM / Dtype / FUSE_RELU; confirm against the host-side build options.
 */
__kernel void MVN_GROUP(__global const Dtype* src,
                        const int rows,
                        const int cols,
                        const Dtype eps,
                        __global const Dtype* mean,
                        __global const Dtype* dev,
                        __global const Dtype* weight,
                        __global const Dtype* bias,
                        const int channels,
                        const int num_groups,
                        const float relu_slope,
                        __global Dtype* dst)
{
    int x = get_global_id(0);        // row index: one normalized group per row
    int y = get_global_id(1) * NUM;  // column index: NUM elements per work-item
    int index = x * cols + y;

    if (x >= rows || y >= cols)
        return;

    int group_size = channels / num_groups;
    int step = norm_size / group_size;
    // Map (row, column) back to the original channel to select weight/bias.
    int channel_index = x % num_groups * group_size + y / step;  // fixed: statement was missing ';'

    Dtype mean_val = mean[x];
    Dtype dev_val = dev[x];
    Dtype alpha;
#ifdef NORM_VARIANCE
    alpha = 1 / sqrt(eps + dev_val);
#else
    alpha = 1;
#endif

    Dtype w = weight[channel_index], b = bias[channel_index];
    vec_type src_vec = load(src, index) - (vec_type)mean_val;
    vec_type dst_vec = src_vec * alpha;
    dst_vec = dst_vec * w + (vec_type)b;
#ifdef FUSE_RELU
    // Leaky-ReLU: keep positive lanes, scale negative lanes by relu_slope.
    vec_type new_val = dst_vec * relu_slope;
    dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);
#endif
    store(dst_vec, dst, index);
}
#elif defined KERNEL_MEAN_FUSE
__kernel void MEAN_FUSE(__global const T * A,

View File

@ -95,6 +95,12 @@ public:
Net net;
};
// NOTE(review): disabled via GTest's DISABLED_ prefix — presumably pending
// availability of the yolov8n.onnx model file; confirm before enabling.
TEST_P(DNNTestNetwork, DISABLED_YOLOv8n) {
    // Run the full YOLOv8n network end-to-end on a 640x640 input and require
    // that the IE and CUDA backends execute it natively (no CPU fallbacks).
    processNet("dnn/onnx/models/yolov8n.onnx", "", Size(640, 640), "output0");
    expectNoFallbacksFromIE(net);
    expectNoFallbacksFromCUDA(net);
}
TEST_P(DNNTestNetwork, AlexNet)
{
applyTestTag(CV_TEST_TAG_MEMORY_1GB);
@ -1454,6 +1460,71 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, Eltwise, testing::Combine(
dnnBackendsAndTargets()
));
////////////////////////////////////////////////////////////////////////////////
// Element-wise layers
////////////////////////////////////////////////////////////////////////////////
using NaryEltwiseConcat = TestWithParam<tuple<std::vector<int>, tuple<Backend, Target>>>;
TEST_P(NaryEltwiseConcat, Accuracy) {
    // Unpack test parameters: the input shape plus the (backend, target) pair.
    auto params = GetParam();
    const std::vector<int> shape = get<0>(params);
    const auto backend = get<0>(get<1>(params));
    const auto target = get<1>(get<1>(params));

    /* Build the following net:

                <1x4x84>
                /
        [Input] -+-> Mul(B<1x84>) -> Concat(axis=1) -> [Output]
                 |                      |
                 +-> Sigmoid ----------+
    */
    Net net;

    // Constant multiplier B: same trailing dimension as the input, 1 elsewhere.
    std::vector<int> b_shape(shape.size() - 1, 1);
    b_shape.back() = shape.back();
    Mat b_blob(b_shape, CV_32FC1);
    randn(b_blob, 0.f, 1.f);
    LayerParams const_lp;
    const_lp.name = "mul_B";
    const_lp.type = "Const";
    const_lp.blobs.push_back(b_blob);
    const int const_id = net.addLayer(const_lp.name, const_lp.type, const_lp);

    // Element-wise multiplication of the input with the constant B.
    LayerParams mul_params;
    mul_params.name = "mul";
    mul_params.type = "NaryEltwise";
    mul_params.set("operation", "mul");
    const int mul_id = net.addLayer(mul_params.name, mul_params.type, mul_params);
    net.connect(0, 0, mul_id, 0);
    net.connect(const_id, 0, mul_id, 1);

    // Parallel Sigmoid branch fed directly from the network input.
    LayerParams sig_params;
    sig_params.name = "sigmoid";
    sig_params.type = "Sigmoid";
    const int sig_id = net.addLayer(sig_params.name, sig_params.type, sig_params);
    net.connect(0, 0, sig_id, 0);

    // Concatenate both branches along axis 1.
    LayerParams cat_params;
    cat_params.name = "concat";
    cat_params.type = "Concat";
    cat_params.set("axis", 1);
    const int cat_id = net.addLayer(cat_params.name, cat_params.type, cat_params);
    net.connect(mul_id, 0, cat_id, 0);
    net.connect(sig_id, 0, cat_id, 1);

    // Run the net against the reference implementation on the chosen backend/target.
    Mat input(shape, CV_32FC1);
    testLayer(input, net, backend, target, false);
}
INSTANTIATE_TEST_CASE_P(Layer_Test_Backends, NaryEltwiseConcat, testing::Combine(
testing::Values(std::vector<int>{1, 4, 84}),
dnnBackendsAndTargets())
);
INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_layers_backends, dnnBackendsAndTargets());
}} // namespace

View File

@ -47,6 +47,10 @@ TEST_F(Test_Graph_Simplifier, LayerNormSubGraph) {
test("layer_norm_expanded_with_initializers", "LayerNormalization");
}
TEST_F(Test_Graph_Simplifier, LayerNormNoFusionSubGraph) {
    // The "layer_norm_no_fusion" model must NOT be fused into a single
    // LayerNormalization layer: the importer is expected to keep the
    // original NaryEltwise / Reduce / Sqrt layers.
    test("layer_norm_no_fusion", std::vector<std::string>{"NaryEltwise", "Reduce", "Sqrt"});
}
TEST_F(Test_Graph_Simplifier, ResizeSubgraph) {
/* Test for 6 subgraphs:
- GatherCastSubgraph

View File

@ -2050,7 +2050,7 @@ private:
net.setPreferableTarget(target);
Mat re;
ASSERT_NO_THROW(re = net.forward()); // runtime error
re = net.forward();
auto ptr_re = (float *) re.data;
for (int i = 0; i < re.total(); i++)
if (op == "sum"){

View File

@ -1033,14 +1033,10 @@ TEST_P(Test_two_inputs, basic)
randu(firstInp, 0, 100);
randu(secondInp, 0, 100);
#ifndef CV_CXX11
std::vector<String> input_names;
input_names.push_back("data");
input_names.push_back("second_input");
net.setInputsNames(input_names);
#else
net.setInputsNames({"data", "second_input"});
#endif
net.setInput(firstInp, "data", kScale);
net.setInput(secondInp, "second_input", kScaleInv);
net.setPreferableBackend(backendId);

View File

@ -311,6 +311,8 @@ static const TestCase testConformanceConfig[] = {
{"test_gridsample_nearest", 2, 1},
{"test_gridsample_reflection_padding", 2, 1},
{"test_gridsample_zeros_padding", 2, 1},
{"test_group_normalization_epsilon", 3, 1},
{"test_group_normalization_example", 3, 1},
{"test_gru_batchwise", 3, 2},
{"test_gru_defaults", 3, 1},
{"test_gru_seq_length", 4, 1},

View File

@ -736,6 +736,10 @@ CASE(test_gridsample_reflection_padding)
// no filter
CASE(test_gridsample_zeros_padding)
// no filter
CASE(test_group_normalization_epsilon)
// no filter
CASE(test_group_normalization_example)
// no filter
CASE(test_gru_batchwise)
// no filter
CASE(test_gru_defaults)
@ -1056,10 +1060,25 @@ CASE(test_mod_int64_fmod)
// no filter
CASE(test_mod_mixed_sign_float16)
// no filter
if (target == DNN_TARGET_OPENCL)
{
default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05
default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001
}
CASE(test_mod_mixed_sign_float32)
// no filter
if (target == DNN_TARGET_OPENCL)
{
default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104141 vs 1e-05
default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156212 vs 0.0001
}
CASE(test_mod_mixed_sign_float64)
// no filter
if (target == DNN_TARGET_OPENCL)
{
default_l1 = 0.0011; // Expected: (normL1) <= (l1), actual: 0.00104167 vs 1e-05
default_lInf = 0.0016; // Expected: (normInf) <= (lInf), actual: 0.00156251 vs 0.0001
}
CASE(test_mod_mixed_sign_int16)
// no filter
CASE(test_mod_mixed_sign_int32)

View File

@ -41,7 +41,7 @@
"test_cast_STRING_to_FLOAT",
"test_castlike_FLOAT_to_STRING_expanded",
"test_castlike_STRING_to_FLOAT_expanded",
"test_concat_1d_axis_negative_1",
"test_concat_1d_axis_negative_1", // 1d support is required
"test_div_uint8", // output type mismatch
"test_maxpool_2d_dilations",
"test_maxpool_2d_same_lower",

View File

@ -210,9 +210,6 @@
"test_min_uint8",
"test_mod_broadcast",
"test_mod_int64_fmod",
"test_mod_mixed_sign_float16",
"test_mod_mixed_sign_float32",
"test_mod_mixed_sign_float64",
"test_mod_mixed_sign_int16",
"test_mod_mixed_sign_int32",
"test_mod_mixed_sign_int64",

View File

@ -2673,24 +2673,36 @@ void yoloPostProcessing(
cv::transposeND(outs[0], {0, 2, 1}, outs[0]);
}
// each row is [cx, cy, w, h, conf_obj, conf_class1, ..., conf_class80]
if (test_name == "yolonas"){
// outs contains 2 elemets of shape [1, 8400, 80] and [1, 8400, 4]. Concat them to get [1, 8400, 84]
Mat concat_out;
// squeeze the first dimension
outs[0] = outs[0].reshape(1, outs[0].size[1]);
outs[1] = outs[1].reshape(1, outs[1].size[1]);
cv::hconcat(outs[1], outs[0], concat_out);
outs[0] = concat_out;
// remove the second element
outs.pop_back();
// unsqueeze the first dimension
outs[0] = outs[0].reshape(0, std::vector<int>{1, 8400, 84});
}
for (auto preds : outs){
preds = preds.reshape(1, preds.size[1]); // [1, 8400, 85] -> [8400, 85]
for (int i = 0; i < preds.rows; ++i)
{
// filter out non objects
float obj_conf = (test_name != "yolov8") ? preds.at<float>(i, 4) : 1.0f;
// filter out non object
float obj_conf = (test_name == "yolov8" || test_name == "yolonas") ? 1.0f : preds.at<float>(i, 4) ;
if (obj_conf < conf_threshold)
continue;
Mat scores = preds.row(i).colRange((test_name != "yolov8") ? 5 : 4, preds.cols);
Mat scores = preds.row(i).colRange((test_name == "yolov8" || test_name == "yolonas") ? 4 : 5, preds.cols);
double conf;
Point maxLoc;
minMaxLoc(scores, 0, &conf, 0, &maxLoc);
conf = (test_name != "yolov8") ? conf * obj_conf : conf;
conf = (test_name == "yolov8" || test_name == "yolonas") ? conf : conf * obj_conf;
if (conf < conf_threshold)
continue;
@ -2701,9 +2713,14 @@ void yoloPostProcessing(
double w = det[2];
double h = det[3];
// std::cout << "cx: " << cx << " cy: " << cy << " w: " << w << " h: " << h << " conf: " << conf << " idx: " << maxLoc.x << std::endl;
// [x1, y1, x2, y2]
if (test_name == "yolonas"){
boxes.push_back(Rect2d(cx, cy, w, h));
} else {
boxes.push_back(Rect2d(cx - 0.5 * w, cy - 0.5 * h,
cx + 0.5 * w, cy + 0.5 * h));
}
classIds.push_back(maxLoc.x);
confidences.push_back(conf);
}
@ -2758,6 +2775,41 @@ TEST_P(Test_ONNX_nets, YOLOX)
1.0e-4, 1.0e-4);
}
// Accuracy test for the YOLO-NAS-S detector: runs the ONNX model through
// testYOLO() and checks detected class ids / scores / boxes against references.
TEST_P(Test_ONNX_nets, YOLONas)
{
    // model information: https://dl.opencv.org/models/yolo-nas/Readme.md
    std::string weightPath = _tf("models/yolo_nas_s.onnx", false);

    Size targetSize{640, 640};
    float conf_threshold = 0.50;
    float iou_threshold = 0.50;

    // Expected detections for the test image (COCO class ids and confidences).
    std::vector<int> refClassIds{1, 16, 7};
    std::vector<float> refScores{0.9720f, 0.9283f, 0.8990f};
    // Reference boxes given as corner coordinates: [x1, y1, x2, y2]
    std::vector<Rect2d> refBoxes{
        Rect2d(105.516, 173.696, 471.323, 430.433),
        Rect2d(109.241, 263.406, 259.872, 531.858),
        Rect2d(390.153, 142.492, 574.932, 222.709)
    };

    // Preprocessing: scale to [0,1], letterbox to 640x640 with pad value 114,
    // NCHW float32 blob.
    Image2BlobParams imgParams(
        Scalar::all(1/255.0),
        targetSize,
        Scalar::all(0),
        false,
        CV_32F,
        DNN_LAYOUT_NCHW,
        DNN_PMODE_LETTERBOX,
        Scalar::all(114)
    );

    // "yolonas" selects the YOLO-NAS-specific postprocessing path in testYOLO.
    testYOLO(
        weightPath, refClassIds, refScores, refBoxes,
        imgParams, conf_threshold, iou_threshold,
        1.0e-4, 1.0e-4, "yolonas");
}
TEST_P(Test_ONNX_nets, YOLOv8)
{
std::string weightPath = _tf("models/yolov8n.onnx", false);
@ -2804,7 +2856,7 @@ TEST_P(Test_ONNX_nets, YOLOv7)
CV_TEST_TAG_DEBUG_VERYLONG
);
std::string weightPath = _tf("models/yolov7_not_simplified.onnx", false);
std::string weightPath = _tf("models/yolov7.onnx", false);
// Reference, which is collected with input size of 640x640
std::vector<int> refClassIds{1, 16, 7};
std::vector<float> refScores{0.9614331f, 0.9589417f, 0.8679074f};
@ -3031,6 +3083,10 @@ TEST_P(Test_ONNX_nets, VitTrack) {
normAssert(ref_output3, outputs[2], "VitTrack output3");
}
TEST_P(Test_ONNX_layers, LayerNormNoFusion) {
    // Accuracy check for the "layer_norm_no_fusion" model — a layer-norm-like
    // graph that does not match the LayerNormalization fusion pattern and so
    // runs through the individual layers.
    testONNXModels("layer_norm_no_fusion");
}
INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets());
}} // namespace

View File

@ -2,7 +2,7 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018-2023 Intel Corporation
// Copyright (C) 2018-2024 Intel Corporation
#include "precomp.hpp"
@ -10,7 +10,7 @@
// (cv::gapi::ie::backend() is still there and is defined always)
#include "backends/ie/giebackend.hpp"
#ifdef HAVE_INF_ENGINE
#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000
#if INF_ENGINE_RELEASE <= 2019010000
# error G-API IE module supports only OpenVINO IE >= 2019 R1

View File

@ -2,7 +2,7 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2024 Intel Corporation
#ifndef OPENCV_GAPI_GIEBACKEND_HPP
#define OPENCV_GAPI_GIEBACKEND_HPP
@ -10,7 +10,7 @@
// Include anyway - cv::gapi::ie::backend() still needs to be defined
#include "opencv2/gapi/infer/ie.hpp"
#ifdef HAVE_INF_ENGINE
#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000
#include <ade/util/algorithm.hpp> // type_list_index
#include <condition_variable>

View File

@ -2,9 +2,9 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2020 Intel Corporation
// Copyright (C) 2020-2024 Intel Corporation
#ifdef HAVE_INF_ENGINE
#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000
#include <vector>
#include <string>

View File

@ -6,7 +6,7 @@
#include "../test_precomp.hpp"
#ifdef HAVE_INF_ENGINE
#if defined HAVE_INF_ENGINE && INF_ENGINE_RELEASE < 2024000000
#include <stdexcept>
#include <mutex>

View File

@ -1662,14 +1662,14 @@ CvWindow::CvWindow(QString name, int arg2)
//Now attach everything
if (myToolBar)
myGlobalLayout->addWidget(myToolBar, Qt::AlignCenter);
myGlobalLayout->addWidget(myToolBar, 0, Qt::AlignLeft);
myGlobalLayout->addWidget(myView->getWidget(), Qt::AlignCenter);
myGlobalLayout->addWidget(myView->getWidget(), 0, Qt::AlignCenter);
myGlobalLayout->addLayout(myBarLayout, Qt::AlignCenter);
myGlobalLayout->addLayout(myBarLayout);
if (myStatusBar)
myGlobalLayout->addWidget(myStatusBar, Qt::AlignCenter);
myGlobalLayout->addWidget(myStatusBar, 0, Qt::AlignLeft);
setLayout(myGlobalLayout);
show();
@ -2079,7 +2079,6 @@ void CvWindow::createStatusBar()
{
myStatusBar = new QStatusBar(this);
myStatusBar->setSizeGripEnabled(false);
myStatusBar->setFixedHeight(20);
myStatusBar->setMinimumWidth(1);
myStatusBar_msg = new QLabel;

View File

@ -409,7 +409,9 @@ bool JpegDecoder::readData( Mat& img )
{
jpeg_decompress_struct* cinfo = &((JpegState*)m_state)->cinfo;
JpegErrorMgr* jerr = &((JpegState*)m_state)->jerr;
#ifndef JCS_EXTENSIONS
JSAMPARRAY buffer = 0;
#endif
if( setjmp( jerr->setjmp_buffer ) == 0 )
{
@ -429,6 +431,18 @@ bool JpegDecoder::readData( Mat& img )
}
#endif
#ifdef JCS_EXTENSIONS
if( color )
{
cinfo->out_color_space = JCS_EXT_BGR;
cinfo->out_color_components = 3;
}
else
{
cinfo->out_color_space = JCS_GRAYSCALE;
cinfo->out_color_components = 1;
}
#else
if( color )
{
if( cinfo->num_components != 4 )
@ -455,6 +469,7 @@ bool JpegDecoder::readData( Mat& img )
cinfo->out_color_components = 4;
}
}
#endif
// Check for Exif marker APP1
jpeg_saved_marker_ptr exif_marker = NULL;
@ -481,12 +496,17 @@ bool JpegDecoder::readData( Mat& img )
jpeg_start_decompress( cinfo );
#ifndef JCS_EXTENSIONS
buffer = (*cinfo->mem->alloc_sarray)((j_common_ptr)cinfo,
JPOOL_IMAGE, m_width*4, 1 );
#endif
uchar* data = img.ptr();
for( ; m_height--; data += step )
{
#ifdef JCS_EXTENSIONS
jpeg_read_scanlines( cinfo, &data, 1 );
#else
jpeg_read_scanlines( cinfo, buffer, 1 );
if( color )
{
@ -502,6 +522,7 @@ bool JpegDecoder::readData( Mat& img )
else
icvCvt_CMYK2Gray_8u_C4C1R( buffer[0], 0, data, 0, Size(m_width,1) );
}
#endif
}
result = true;
@ -593,8 +614,11 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
int width = img.cols, height = img.rows;
std::vector<uchar> out_buf(1 << 12);
#ifndef JCS_EXTENSIONS
AutoBuffer<uchar> _buffer;
uchar* buffer;
#endif
struct jpeg_compress_struct cinfo;
JpegErrorMgr jerr;
@ -629,8 +653,15 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
int _channels = img.channels();
int channels = _channels > 1 ? 3 : 1;
#ifdef JCS_EXTENSIONS
cinfo.input_components = _channels;
cinfo.in_color_space = _channels == 3 ? JCS_EXT_BGR
: _channels == 4 ? JCS_EXT_BGRX : JCS_GRAYSCALE;
#else
cinfo.input_components = channels;
cinfo.in_color_space = channels > 1 ? JCS_RGB : JCS_GRAYSCALE;
#endif
int quality = 95;
int progressive = 0;
@ -746,14 +777,17 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
jpeg_start_compress( &cinfo, TRUE );
#ifndef JCS_EXTENSIONS
if( channels > 1 )
_buffer.allocate(width*channels);
buffer = _buffer.data();
#endif
for( int y = 0; y < height; y++ )
{
uchar *data = img.data + img.step*y, *ptr = data;
#ifndef JCS_EXTENSIONS
if( _channels == 3 )
{
icvCvt_BGR2RGB_8u_C3R( data, 0, buffer, 0, Size(width,1) );
@ -764,6 +798,7 @@ bool JpegEncoder::write( const Mat& img, const std::vector<int>& params )
icvCvt_BGRA2BGR_8u_C4C3R( data, 0, buffer, 0, Size(width,1), 2 );
ptr = buffer;
}
#endif
jpeg_write_scanlines( &cinfo, &ptr, 1 );
}

View File

@ -210,15 +210,8 @@ struct ImageCodecInitializer
/**
 * Returns the process-wide image codec registry.
 *
 * C++11 guarantees thread-safe initialization of function-local statics
 * ("magic statics"), so a plain static instance is sufficient. The legacy
 * C++98 heap-allocated workaround (guarded by CV_CXX11) is dead code —
 * OpenCV requires C++11, and the same CV_CXX11 guards are being removed
 * elsewhere in this codebase — so it is dropped here.
 */
static
ImageCodecInitializer& getCodecs()
{
    static ImageCodecInitializer g_codecs;
    return g_codecs;
}
/**

View File

@ -166,7 +166,7 @@ TEST_P(Imgcodecs_Avif_Image_EncodeDecodeSuite, imencode_imdecode) {
cv::Exception);
return;
}
bool result;
bool result = true;
EXPECT_NO_THROW(
result = cv::imencode(".avif", img_original, buf, encoding_params_););
EXPECT_TRUE(result);

View File

@ -4490,7 +4490,7 @@ An example using applyColorMap function
/** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image.
@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
@param colormap The colormap to apply, see #ColormapTypes
*/
@ -4498,8 +4498,8 @@ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap);
/** @brief Applies a user colormap on a given image.
@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3.
@param dst The result is the colormapped source image. Note: Mat::create is called on dst.
@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY.
@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst.
@param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256
*/
CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor);

View File

@ -17,7 +17,7 @@ ocv_add_module(java BINDINGS opencv_core opencv_imgproc PRIVATE_REQUIRED opencv_
include(${CMAKE_CURRENT_SOURCE_DIR}/common.cmake)
# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA, AIDL)
# UTILITY: glob specific sources and append them to list (type is in H, CPP, JAVA)
macro(glob_more_specific_sources _type _root _output)
unset(_masks)
if(${_type} STREQUAL "H")
@ -26,8 +26,6 @@ macro(glob_more_specific_sources _type _root _output)
set(_masks "${_root}/cpp/*.cpp")
elseif(${_type} STREQUAL "JAVA")
set(_masks "${_root}/java/*.java" "${_root}/java/*.java.in")
elseif(${_type} STREQUAL "AIDL")
set(_masks "${_root}/java/*.aidl")
endif()
if (_masks)
file(GLOB _result ${_masks})

View File

@ -42,7 +42,6 @@ android {
main {
jniLibs.srcDirs = ['../../jni']
java.srcDirs = ['src'] // TODO Use original files instead of copied into build directory
aidl.srcDirs = ['src']
res.srcDirs = ['@OpenCV_SOURCE_DIR@/modules/java/android_sdk/android_gradle_lib/res']
manifest.srcFile 'AndroidManifest.xml'
}

View File

@ -121,8 +121,6 @@ android {
targetCompatibility JavaVersion.VERSION_@ANDROID_GRADLE_JAVA_VERSION_INIT@
}
@ANDROID_GRADLE_BUILD_FEATURE_AIDL@
buildTypes {
debug {
packagingOptions {
@ -139,7 +137,6 @@ android {
}
buildFeatures {
aidl true
prefabPublishing true
buildConfig true
}
@ -153,7 +150,6 @@ android {
main {
jniLibs.srcDirs = ['native/libs']
java.srcDirs = ['java/src']
aidl.srcDirs = ['java/src']
res.srcDirs = ['java/res']
manifest.srcFile 'java/AndroidManifest.xml'
}

View File

@ -46,6 +46,7 @@ public class JavaCamera2View extends CameraBridgeViewBase {
protected ImageReader mImageReader;
protected int mPreviewFormat = ImageFormat.YUV_420_888;
protected int mRequestTemplate = CameraDevice.TEMPLATE_PREVIEW;
private int mFrameRotation;
protected CameraDevice mCameraDevice;
protected CameraCaptureSession mCaptureSession;
@ -86,8 +87,8 @@ public class JavaCamera2View extends CameraBridgeViewBase {
}
}
protected boolean initializeCamera() {
Log.i(LOGTAG, "initializeCamera");
protected boolean selectCamera() {
Log.i(LOGTAG, "selectCamera");
CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE);
try {
String camList[] = manager.getCameraIdList();
@ -110,14 +111,10 @@ public class JavaCamera2View extends CameraBridgeViewBase {
}
}
}
if (mCameraID != null) {
Log.i(LOGTAG, "Opening camera: " + mCameraID);
manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
} else { // make JavaCamera2View behaves in the same way as JavaCameraView
Log.i(LOGTAG, "Trying to open camera with the value (" + mCameraIndex + ")");
if (mCameraID == null) { // make JavaCamera2View behaves in the same way as JavaCameraView
Log.i(LOGTAG, "Selecting camera by index (" + mCameraIndex + ")");
if (mCameraIndex < camList.length) {
mCameraID = camList[mCameraIndex];
manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
} else {
// CAMERA_DISCONNECTED is used when the camera id is no longer valid
throw new CameraAccessException(CameraAccessException.CAMERA_DISCONNECTED);
@ -125,11 +122,11 @@ public class JavaCamera2View extends CameraBridgeViewBase {
}
return true;
} catch (CameraAccessException e) {
Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e);
Log.e(LOGTAG, "selectCamera - Camera Access Exception", e);
} catch (IllegalArgumentException e) {
Log.e(LOGTAG, "OpenCamera - Illegal Argument Exception", e);
Log.e(LOGTAG, "selectCamera - Illegal Argument Exception", e);
} catch (SecurityException e) {
Log.e(LOGTAG, "OpenCamera - Security Exception", e);
Log.e(LOGTAG, "selectCamera - Security Exception", e);
}
return false;
}
@ -204,6 +201,7 @@ public class JavaCamera2View extends CameraBridgeViewBase {
mImageReader.setOnImageAvailableListener(new ImageReader.OnImageAvailableListener() {
@Override
public void onImageAvailable(ImageReader reader) {
Image image = reader.acquireLatestImage();
if (image == null)
return;
@ -213,8 +211,9 @@ public class JavaCamera2View extends CameraBridgeViewBase {
assert (planes.length == 3);
assert (image.getFormat() == mPreviewFormat);
JavaCamera2Frame tempFrame = new JavaCamera2Frame(image);
RotatedCameraFrame tempFrame = new RotatedCameraFrame(new JavaCamera2Frame(image), mFrameRotation);
deliverAndDrawFrame(tempFrame);
tempFrame.mFrame.release();
tempFrame.release();
image.close();
}
@ -303,11 +302,22 @@ public class JavaCamera2View extends CameraBridgeViewBase {
protected boolean connectCamera(int width, int height) {
Log.i(LOGTAG, "setCameraPreviewSize(" + width + "x" + height + ")");
startBackgroundThread();
initializeCamera();
selectCamera();
try {
CameraManager manager = (CameraManager) getContext().getSystemService(Context.CAMERA_SERVICE);
CameraCharacteristics characteristics = manager.getCameraCharacteristics(mCameraID);
mFrameRotation = getFrameRotation(
characteristics.get(CameraCharacteristics.LENS_FACING) == CameraCharacteristics.LENS_FACING_FRONT,
characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION));
boolean needReconfig = calcPreviewSize(width, height);
if (mFrameRotation % 180 == 0) {
mFrameWidth = mPreviewSize.getWidth();
mFrameHeight = mPreviewSize.getHeight();
} else {
mFrameWidth = mPreviewSize.getHeight();
mFrameHeight = mPreviewSize.getWidth();
}
if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@ -322,12 +332,16 @@ public class JavaCamera2View extends CameraBridgeViewBase {
mCaptureSession.close();
mCaptureSession = null;
}
createCameraPreviewSession();
}
if (mFpsMeter != null) {
mFpsMeter.setResolution(mFrameWidth, mFrameHeight);
}
Log.i(LOGTAG, "Opening camera: " + mCameraID);
manager.openCamera(mCameraID, mStateCallback, mBackgroundHandler);
} catch (CameraAccessException e) {
Log.e(LOGTAG, "OpenCamera - Camera Access Exception", e);
} catch (RuntimeException e) {
throw new RuntimeException("Interrupted while setCameraPreviewSize.", e);
}
@ -442,6 +456,7 @@ public class JavaCamera2View extends CameraBridgeViewBase {
mGray = new Mat();
}
@Override
public void release() {
mRgba.release();
mGray.release();

View File

@ -10,6 +10,7 @@ import org.opencv.videoio.VideoCapture;
import org.opencv.videoio.VideoWriter;
import android.content.Context;
import android.hardware.Camera;
import android.util.AttributeSet;
import android.util.Log;
import android.view.ViewGroup.LayoutParams;
@ -25,7 +26,7 @@ public class NativeCameraView extends CameraBridgeViewBase {
private Thread mThread;
protected VideoCapture mCamera;
protected NativeCameraFrame mFrame;
protected RotatedCameraFrame mFrame;
public NativeCameraView(Context context, int cameraId) {
super(context, cameraId);
@ -89,28 +90,65 @@ public class NativeCameraView extends CameraBridgeViewBase {
private boolean initializeCamera(int width, int height) {
synchronized (this) {
if (mCameraIndex == -1) {
Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
int localCameraIndex = mCameraIndex;
if (mCameraIndex == CAMERA_ID_ANY) {
Log.d(TAG, "Try to open default camera");
mCamera = new VideoCapture(0, Videoio.CAP_ANDROID);
} else {
Log.d(TAG, "Try to open camera with index " + mCameraIndex);
mCamera = new VideoCapture(mCameraIndex, Videoio.CAP_ANDROID);
localCameraIndex = 0;
} else if (mCameraIndex == CAMERA_ID_BACK) {
Log.i(TAG, "Trying to open back camera");
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
Camera.getCameraInfo( camIdx, cameraInfo );
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
localCameraIndex = camIdx;
break;
}
}
} else if (mCameraIndex == CAMERA_ID_FRONT) {
Log.i(TAG, "Trying to open front camera");
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
Camera.getCameraInfo( camIdx, cameraInfo );
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
localCameraIndex = camIdx;
break;
}
}
}
if (localCameraIndex == CAMERA_ID_BACK) {
Log.e(TAG, "Back camera not found!");
return false;
} else if (localCameraIndex == CAMERA_ID_FRONT) {
Log.e(TAG, "Front camera not found!");
return false;
}
Log.d(TAG, "Try to open camera with index " + localCameraIndex);
mCamera = new VideoCapture(localCameraIndex, Videoio.CAP_ANDROID);
if (mCamera == null)
return false;
if (mCamera.isOpened() == false)
return false;
mFrame = new NativeCameraFrame(mCamera);
if (mCameraIndex != CAMERA_ID_BACK && mCameraIndex != CAMERA_ID_FRONT)
Camera.getCameraInfo(localCameraIndex, cameraInfo);
int frameRotation = getFrameRotation(
cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT,
cameraInfo.orientation);
mFrame = new RotatedCameraFrame(new NativeCameraFrame(mCamera), frameRotation);
mCamera.set(Videoio.CAP_PROP_FRAME_WIDTH, width);
mCamera.set(Videoio.CAP_PROP_FRAME_HEIGHT, height);
mFrameWidth = (int)mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH);
mFrameHeight = (int)mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT);
if (frameRotation % 180 == 0) {
mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH);
mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT);
} else {
mFrameWidth = (int) mCamera.get(Videoio.CAP_PROP_FRAME_HEIGHT);
mFrameHeight = (int) mCamera.get(Videoio.CAP_PROP_FRAME_WIDTH);
}
if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@ -131,7 +169,10 @@ public class NativeCameraView extends CameraBridgeViewBase {
private void releaseCamera() {
synchronized (this) {
if (mFrame != null) mFrame.release();
if (mFrame != null) {
mFrame.mFrame.release();
mFrame.release();
}
if (mCamera != null) mCamera.release();
}
}
@ -162,6 +203,7 @@ public class NativeCameraView extends CameraBridgeViewBase {
mBgr = new Mat();
}
@Override
public void release() {
if (mGray != null) mGray.release();
if (mRgba != null) mRgba.release();

View File

@ -4,6 +4,7 @@ import java.util.List;
import org.opencv.BuildConfig;
import org.opencv.R;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.Size;
@ -17,8 +18,10 @@ import android.graphics.Canvas;
import android.graphics.Rect;
import android.util.AttributeSet;
import android.util.Log;
import android.view.Surface;
import android.view.SurfaceHolder;
import android.view.SurfaceView;
import android.view.WindowManager;
/**
* This is a basic class, implementing the interaction with Camera and OpenCV library.
@ -189,8 +192,93 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
* This method returns single channel gray scale Mat with frame
*/
public Mat gray();
public void release();
};
public class RotatedCameraFrame implements CvCameraViewFrame {
@Override
public Mat gray() {
if (mRotation != 0) {
Core.rotate(mFrame.gray(), mGrayRotated, getCvRotationCode(mRotation));
return mGrayRotated;
} else {
return mFrame.gray();
}
}
@Override
public Mat rgba() {
if (mRotation != 0) {
Core.rotate(mFrame.rgba(), mRgbaRotated, getCvRotationCode(mRotation));
return mRgbaRotated;
} else {
return mFrame.rgba();
}
}
private int getCvRotationCode(int degrees) {
if (degrees == 90) {
return Core.ROTATE_90_CLOCKWISE;
} else if (degrees == 180) {
return Core.ROTATE_180;
} else {
return Core.ROTATE_90_COUNTERCLOCKWISE;
}
}
public RotatedCameraFrame(CvCameraViewFrame frame, int rotation) {
super();
mFrame = frame;
mRgbaRotated = new Mat();
mGrayRotated = new Mat();
mRotation = rotation;
}
@Override
public void release() {
mRgbaRotated.release();
mGrayRotated.release();
}
public CvCameraViewFrame mFrame;
private Mat mRgbaRotated;
private Mat mGrayRotated;
private int mRotation;
};
/**
* Calculates how to rotate camera frame to match current screen orientation
*/
protected int getFrameRotation(boolean cameraFacingFront, int cameraSensorOrientation) {
WindowManager windowManager = (WindowManager) getContext().getSystemService(Context.WINDOW_SERVICE);
int screenOrientation = windowManager.getDefaultDisplay().getRotation();
int screenRotation = 0;
switch (screenOrientation) {
case Surface.ROTATION_0:
screenRotation = 0;
break;
case Surface.ROTATION_90:
screenRotation = 90;
break;
case Surface.ROTATION_180:
screenRotation = 180;
break;
case Surface.ROTATION_270:
screenRotation = 270;
break;
}
int frameRotation;
if (cameraFacingFront) {
frameRotation = (cameraSensorOrientation + screenRotation) % 360;
} else {
frameRotation = (cameraSensorOrientation - screenRotation + 360) % 360;
}
return frameRotation;
}
public void surfaceChanged(SurfaceHolder arg0, int arg1, int arg2, int arg3) {
Log.d(TAG, "call surfaceChanged event");
synchronized(mSyncObject) {

View File

@ -10,9 +10,12 @@ import android.hardware.Camera.PreviewCallback;
import android.os.Build;
import android.util.AttributeSet;
import android.util.Log;
import android.view.Surface;
import android.view.ViewGroup.LayoutParams;
import android.view.WindowManager;
import org.opencv.BuildConfig;
import org.opencv.core.Core;
import org.opencv.core.CvType;
import org.opencv.core.Mat;
import org.opencv.core.Size;
@ -39,7 +42,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
private boolean mStopThread;
protected Camera mCamera;
protected JavaCameraFrame[] mCameraFrame;
protected RotatedCameraFrame[] mCameraFrame;
private SurfaceTexture mSurfaceTexture;
private int mPreviewFormat = ImageFormat.NV21;
@ -71,29 +74,21 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
boolean result = true;
synchronized (this) {
mCamera = null;
int cameraId = -1;
if (mCameraIndex == CAMERA_ID_ANY) {
Log.d(TAG, "Trying to open camera with old open()");
try {
mCamera = Camera.open();
}
catch (Exception e){
Log.e(TAG, "Camera is not available (in use or does not exist): " + e.getLocalizedMessage());
}
if(mCamera == null && Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
boolean connected = false;
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(camIdx) + ")");
try {
mCamera = Camera.open(camIdx);
connected = true;
cameraId = camIdx;
} catch (RuntimeException e) {
Log.e(TAG, "Camera #" + camIdx + "failed to open: " + e.getLocalizedMessage());
}
if (connected) break;
}
}
} else {
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
int localCameraIndex = mCameraIndex;
@ -126,6 +121,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")");
try {
mCamera = Camera.open(localCameraIndex);
cameraId = localCameraIndex;
} catch (RuntimeException e) {
Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage());
}
@ -136,6 +132,11 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
if (mCamera == null)
return false;
android.hardware.Camera.CameraInfo info = new android.hardware.Camera.CameraInfo();
android.hardware.Camera.getCameraInfo(cameraId, info);
int frameRotation = getFrameRotation(
info.facing == Camera.CameraInfo.CAMERA_FACING_FRONT,
info.orientation);
/* Now set camera parameters */
try {
Camera.Parameters params = mCamera.getParameters();
@ -176,8 +177,16 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
mCamera.setParameters(params);
params = mCamera.getParameters();
int rawFrameWidth = params.getPreviewSize().width;
int rawFrameHeight = params.getPreviewSize().height;
if (frameRotation % 180 == 0) {
mFrameWidth = params.getPreviewSize().width;
mFrameHeight = params.getPreviewSize().height;
} else {
mFrameWidth = params.getPreviewSize().height;
mFrameHeight = params.getPreviewSize().width;
}
if ((getLayoutParams().width == LayoutParams.MATCH_PARENT) && (getLayoutParams().height == LayoutParams.MATCH_PARENT))
mScale = Math.min(((float)height)/mFrameHeight, ((float)width)/mFrameWidth);
@ -196,14 +205,14 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
mCamera.setPreviewCallbackWithBuffer(this);
mFrameChain = new Mat[2];
mFrameChain[0] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
mFrameChain[1] = new Mat(mFrameHeight + (mFrameHeight/2), mFrameWidth, CvType.CV_8UC1);
mFrameChain[0] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
mFrameChain[1] = new Mat(rawFrameHeight + (rawFrameHeight/2), rawFrameWidth, CvType.CV_8UC1);
AllocateCache();
mCameraFrame = new JavaCameraFrame[2];
mCameraFrame[0] = new JavaCameraFrame(mFrameChain[0], mFrameWidth, mFrameHeight);
mCameraFrame[1] = new JavaCameraFrame(mFrameChain[1], mFrameWidth, mFrameHeight);
mCameraFrame = new RotatedCameraFrame[2];
mCameraFrame[0] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[0], rawFrameWidth, rawFrameHeight), frameRotation);
mCameraFrame[1] = new RotatedCameraFrame(new JavaCameraFrame(mFrameChain[1], rawFrameWidth, rawFrameHeight), frameRotation);
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.HONEYCOMB) {
mSurfaceTexture = new SurfaceTexture(MAGIC_TEXTURE_ID);
@ -240,7 +249,9 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
mFrameChain[1].release();
}
if (mCameraFrame != null) {
mCameraFrame[0].mFrame.release();
mCameraFrame[0].release();
mCameraFrame[1].mFrame.release();
mCameraFrame[1].release();
}
}
@ -336,6 +347,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
mRgba = new Mat();
}
@Override
public void release() {
mRgba.release();
}

View File

@ -1254,13 +1254,13 @@ JNIEXPORT void JNICALL Java_org_opencv_%(module)s_%(j_cls)s_delete
def copy_java_files(java_files_dir, java_base_path, default_package_path='org/opencv/'):
global total_files, updated_files
java_files = []
re_filter = re.compile(r'^.+\.(java|aidl|kt)(.in)?$')
re_filter = re.compile(r'^.+\.(java|kt)(.in)?$')
for root, dirnames, filenames in os.walk(java_files_dir):
java_files += [os.path.join(root, filename) for filename in filenames if re_filter.match(filename)]
java_files = [f.replace('\\', '/') for f in java_files]
re_package = re.compile(r'^package +(.+);')
re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|aidl|kt)(.in)?$')
re_prefix = re.compile(r'^.+[\+/]([^\+]+).(java|kt)(.in)?$')
for java_file in java_files:
src = checkFileRemap(java_file)
with open(src, 'r') as f:

View File

@ -27,7 +27,7 @@ public:
* @param prototxt_path prototxt file path for the super resolution model
* @param model_path model file path for the super resolution model
*/
CV_WRAP BarcodeDetector(const std::string &prototxt_path, const std::string &model_path);
CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path);
~BarcodeDetector();
/** @brief Decodes barcode in image once it's found by the detect() method.

View File

@ -82,8 +82,8 @@ public:
* @param backend_id the id of backend
* @param target_id the id of target device
*/
CV_WRAP static Ptr<FaceDetectorYN> create(const String& model,
const String& config,
CV_WRAP static Ptr<FaceDetectorYN> create(CV_WRAP_FILE_PATH const String& model,
CV_WRAP_FILE_PATH const String& config,
const Size& input_size,
float score_threshold = 0.9f,
float nms_threshold = 0.3f,
@ -154,7 +154,7 @@ public:
* @param backend_id the id of backend
* @param target_id the id of target device
*/
CV_WRAP static Ptr<FaceRecognizerSF> create(const String& model, const String& config, int backend_id = 0, int target_id = 0);
CV_WRAP static Ptr<FaceRecognizerSF> create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0);
};
//! @}

View File

@ -483,39 +483,44 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi
Mat noMarginsImg =
out.colRange(marginSize, out.cols - marginSize).rowRange(marginSize, out.rows - marginSize);
double totalLengthX, totalLengthY;
totalLengthX = squareLength * size.width;
totalLengthY = squareLength * size.height;
// proportional transformation
double xReduction = totalLengthX / double(noMarginsImg.cols);
double yReduction = totalLengthY / double(noMarginsImg.rows);
// the size of the chessboard square depends on the location of the chessboard
float pixInSquare = 0.f;
// the size of the chessboard in pixels
Size pixInChessboard(noMarginsImg.cols, noMarginsImg.rows);
// determine the zone where the chessboard is placed
Mat chessboardZoneImg;
if(xReduction > yReduction) {
int nRows = int(totalLengthY / xReduction);
int rowsMargins = (noMarginsImg.rows - nRows) / 2;
chessboardZoneImg = noMarginsImg.rowRange(rowsMargins, noMarginsImg.rows - rowsMargins);
} else {
int nCols = int(totalLengthX / yReduction);
int colsMargins = (noMarginsImg.cols - nCols) / 2;
chessboardZoneImg = noMarginsImg.colRange(colsMargins, noMarginsImg.cols - colsMargins);
float pixInSquareX = (float)noMarginsImg.cols / (float)size.width;
float pixInSquareY = (float)noMarginsImg.rows / (float)size.height;
Point startChessboard(0, 0);
if (pixInSquareX <= pixInSquareY) {
// the width of "noMarginsImg" image determines the dimensions of the chessboard
pixInSquare = pixInSquareX;
pixInChessboard.height = cvRound(pixInSquare*size.height);
int rowsMargin = (noMarginsImg.rows - pixInChessboard.height) / 2;
startChessboard.y = rowsMargin;
}
else {
// the height of "noMarginsImg" image determines the dimensions of the chessboard
pixInSquare = pixInSquareY;
pixInChessboard.width = cvRound(pixInSquare*size.width);
int colsMargin = (noMarginsImg.cols - pixInChessboard.width) / 2;
startChessboard.x = colsMargin;
}
// determine the zone where the chessboard is located
Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
// determine the margins to draw only the markers
// take the minimum just to be sure
double squareSizePixels = min(double(chessboardZoneImg.cols) / double(size.width),
double(chessboardZoneImg.rows) / double(size.height));
// marker size in pixels
const float pixInMarker = markerLength/squareLength*pixInSquare;
// the size of the marker margin in pixels
const float pixInMarginMarker = 0.5f*(pixInSquare - pixInMarker);
double diffSquareMarkerLength = (squareLength - markerLength) / 2;
int diffSquareMarkerLengthPixels =
int(diffSquareMarkerLength * squareSizePixels / squareLength);
// determine the zone where the aruco markers are located
int endArucoX = cvRound(pixInSquare*(size.width-1)+pixInMarginMarker+pixInMarker);
int endArucoY = cvRound(pixInSquare*(size.height-1)+pixInMarginMarker+pixInMarker);
Mat arucoZone = chessboardZoneImg(Range(cvRound(pixInMarginMarker), endArucoY), Range(cvRound(pixInMarginMarker), endArucoX));
// draw markers
Mat markersImg;
Board::Impl::generateImage(chessboardZoneImg.size(), markersImg, diffSquareMarkerLengthPixels, borderBits);
markersImg.copyTo(chessboardZoneImg);
Board::Impl::generateImage(arucoZone.size(), arucoZone, 0, borderBits);
// now draw black squares
for(int y = 0; y < size.height; y++) {
@ -527,12 +532,11 @@ void CharucoBoardImpl::generateImage(Size outSize, OutputArray img, int marginSi
if(y % 2 != x % 2) continue; // white corner, dont do anything
}
double startX, startY;
startX = squareSizePixels * double(x);
startY = squareSizePixels * double(y);
float startX = pixInSquare * float(x);
float startY = pixInSquare * float(y);
Mat squareZone = chessboardZoneImg.rowRange(int(startY), int(startY + squareSizePixels))
.colRange(int(startX), int(startX + squareSizePixels));
Mat squareZone = chessboardZoneImg(Range(cvRound(startY), cvRound(startY + pixInSquare)),
Range(cvRound(startX), cvRound(startX + pixInSquare)));
squareZone.setTo(0);
}

View File

@ -684,7 +684,7 @@ struct ArucoDetector::ArucoDetectorImpl {
contours.clear();
// sort candidates from big to small
std::sort(candidateTree.begin(), candidateTree.end());
std::stable_sort(candidateTree.begin(), candidateTree.end());
// group index for each candidate
vector<int> groupId(candidateTree.size(), -1);
vector<vector<size_t> > groupedCandidates;
@ -728,11 +728,11 @@ struct ArucoDetector::ArucoDetectorImpl {
for (vector<size_t>& grouped : groupedCandidates) {
if (detectorParams.detectInvertedMarker) // if detectInvertedMarker choose smallest contours
std::sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) {
std::stable_sort(grouped.begin(), grouped.end(), [](const size_t &a, const size_t &b) {
return a > b;
});
else // if detectInvertedMarker==false choose largest contours
std::sort(grouped.begin(), grouped.end());
std::stable_sort(grouped.begin(), grouped.end());
size_t currId = grouped[0];
isSelectedContours[currId] = true;
for (size_t i = 1ull; i < grouped.size(); i++) {
@ -780,7 +780,7 @@ struct ArucoDetector::ArucoDetectorImpl {
vector<int> idsTmp(ncandidates, -1);
vector<int> rotated(ncandidates, 0);
vector<uint8_t> validCandidates(ncandidates, 0);
vector<bool> was(ncandidates, false);
vector<uint8_t> was(ncandidates, false);
bool checkCloseContours = true;
int maxDepth = 0;

View File

@ -52,5 +52,7 @@
#include "opencv2/core/private.hpp"
#include <numeric>
#include <array>
#include <vector>
#endif

View File

@ -15,6 +15,7 @@
#include "quirc.h"
#endif
#include <array>
#include <limits>
#include <cmath>
#include <queue>

View File

@ -771,6 +771,57 @@ TEST_P(CharucoBoard, testWrongSizeDetection)
ASSERT_TRUE(detectedCharucoIds.empty());
}
TEST(CharucoBoardGenerate, issue_24806)
{
aruco::Dictionary dict = aruco::getPredefinedDictionary(aruco::DICT_4X4_1000);
const float squareLength = 13.f, markerLength = 10.f;
const Size boardSize(7ull, 4ull);
const aruco::CharucoBoard board(boardSize, squareLength, markerLength, dict);
const int marginSize = 24;
Mat boardImg;
// generate chessboard image
board.generateImage(Size(400, 300), boardImg, marginSize);
// This condition checks that the width of the image determines the dimensions of the chessboard in this test
CV_Assert((float)(boardImg.cols) / (float)boardSize.width <=
(float)(boardImg.rows) / (float)boardSize.height);
// prepare data for chessboard image test
Mat noMarginsImg = boardImg(Range(marginSize, boardImg.rows - marginSize),
Range(marginSize, boardImg.cols - marginSize));
const float pixInSquare = (float)(noMarginsImg.cols) / (float)boardSize.width;
Size pixInChessboard(cvRound(pixInSquare*boardSize.width), cvRound(pixInSquare*boardSize.height));
const Point startChessboard((noMarginsImg.cols - pixInChessboard.width) / 2,
(noMarginsImg.rows - pixInChessboard.height) / 2);
Mat chessboardZoneImg = noMarginsImg(Rect(startChessboard, pixInChessboard));
// B - black pixel, W - white pixel
// chessboard corner 1:
// B W
// W B
Mat goldCorner1 = (Mat_<uint8_t>(2, 2) <<
0, 255,
255, 0);
// B - black pixel, W - white pixel
// chessboard corner 2:
// W B
// B W
Mat goldCorner2 = (Mat_<uint8_t>(2, 2) <<
255, 0,
0, 255);
// test chessboard corners in generated image
for (const Point3f& p: board.getChessboardCorners()) {
Point2f chessCorner(pixInSquare*(p.x/squareLength),
pixInSquare*(p.y/squareLength));
Mat winCorner = chessboardZoneImg(Rect(Point(cvRound(chessCorner.x) - 1, cvRound(chessCorner.y) - 1), Size(2, 2)));
bool eq = (cv::countNonZero(goldCorner1 != winCorner) == 0) | (cv::countNonZero(goldCorner2 != winCorner) == 0);
ASSERT_TRUE(eq);
}
// TODO: fix aruco generateImage and add test aruco corners for generated image
}
// Temporary disabled in https://github.com/opencv/opencv/pull/24338
// 5.x version produces conrnes with different shape than 4.x (32F_C2 instead of 2x 32FC1)
TEST(Charuco, DISABLED_testSeveralBoardsWithCustomIds)

View File

@ -7,10 +7,6 @@
#include "opencv2/ts.hpp"
#include "opencv2/objdetect.hpp"
#if defined CV_CXX11
#include <random>
#else
#include <cstdlib>
#endif
#include <random>
#endif

View File

@ -5,16 +5,6 @@
#include "test_precomp.hpp"
namespace opencv_test { namespace {
#if !defined CV_CXX11
// Wrapper for generating seeded random number via std::rand.
template<unsigned Seed>
class SeededRandFunctor {
public:
SeededRandFunctor() { std::srand(Seed); }
int operator()(int i) { return std::rand() % (i + 1); }
};
#endif
std::string encode_qrcode_images_name[] = {
"version1_mode1.png", "version1_mode2.png", "version1_mode4.png",
"version2_mode1.png", "version2_mode2.png", "version2_mode4.png",

Some files were not shown because too many files have changed in this diff Show More