diff --git a/3rdparty/libtengine/tengine.cmake b/3rdparty/libtengine/tengine.cmake
deleted file mode 100644
index ee8f0cb86f..0000000000
--- a/3rdparty/libtengine/tengine.cmake
+++ /dev/null
@@ -1,80 +0,0 @@
-# COPYRIGHT
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# License); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Copyright (c) 2020, OPEN AI LAB
-# Author: qtang@openailab.com or https://github.com/BUG1989
-#         qli@openailab.com
-#         sqfu@openailab.com
-
-SET(TENGINE_COMMIT_VERSION "e89cf8870de2ff0a80cfe626c0b52b2a16fb302e")
-SET(OCV_TENGINE_DIR "${OpenCV_BINARY_DIR}/3rdparty/libtengine")
-SET(OCV_TENGINE_SOURCE_PATH "${OCV_TENGINE_DIR}/Tengine-${TENGINE_COMMIT_VERSION}")
-
-IF(EXISTS "${OCV_TENGINE_SOURCE_PATH}")
-	MESSAGE(STATUS "Tengine is exist already at: ${OCV_TENGINE_SOURCE_PATH}")
-
-	SET(Tengine_FOUND ON)
-	SET(BUILD_TENGINE ON)
-ELSE()
-	SET(OCV_TENGINE_FILENAME "${TENGINE_COMMIT_VERSION}.zip")#name
-	SET(OCV_TENGINE_URL "https://github.com/OAID/Tengine/archive/") #url
-	SET(tengine_md5sum 23f61ebb1dd419f1207d8876496289c5) #md5sum
-
-	ocv_download(FILENAME ${OCV_TENGINE_FILENAME}
-						HASH ${tengine_md5sum}
-						URL
-						"${OPENCV_TENGINE_URL}"
-						"$ENV{OPENCV_TENGINE_URL}"
-						"${OCV_TENGINE_URL}"
-						DESTINATION_DIR "${OCV_TENGINE_DIR}"
-						ID TENGINE
-						STATUS res
-						UNPACK RELATIVE_URL)
-
-	if (NOT res)
-		MESSAGE(STATUS "TENGINE DOWNLOAD FAILED. Turning Tengine_FOUND off.")
-		SET(Tengine_FOUND OFF)
-	else ()
-		MESSAGE(STATUS "TENGINE DOWNLOAD success . ")
-
-		SET(Tengine_FOUND ON)
-		SET(BUILD_TENGINE ON)
-	endif()
-ENDIF()
-
-if(BUILD_TENGINE)
-	SET(HAVE_TENGINE 1)
-
-	if(NOT ANDROID)
-		# linux system
-		if(CMAKE_SYSTEM_PROCESSOR STREQUAL arm)
-			   SET(TENGINE_TOOLCHAIN_FLAG "-march=armv7-a")
-		elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) ## AARCH64
-			   SET(TENGINE_TOOLCHAIN_FLAG "-march=armv8-a")
-		endif()
-	endif()
-
-	SET(BUILT_IN_OPENCV ON) ## set for tengine compile discern .
-	SET(Tengine_INCLUDE_DIR  "${OCV_TENGINE_SOURCE_PATH}/include" CACHE INTERNAL "")
-	if(EXISTS "${OCV_TENGINE_SOURCE_PATH}/CMakeLists.txt")
-		add_subdirectory("${OCV_TENGINE_SOURCE_PATH}" "${OCV_TENGINE_DIR}/build")
-	else()
-		message(WARNING "TENGINE: Missing 'CMakeLists.txt' in source code package: ${OCV_TENGINE_SOURCE_PATH}")
-	endif()
-	SET(Tengine_LIB "tengine" CACHE INTERNAL "")
-endif()
diff --git a/3rdparty/readme.txt b/3rdparty/readme.txt
index c3068521e3..0e6ce1e05e 100644
--- a/3rdparty/readme.txt
+++ b/3rdparty/readme.txt
@@ -39,7 +39,9 @@ libspng               Portable Network Graphics library.
 libtiff               Tag Image File Format (TIFF) Software
                       Copyright (c) 1988-1997 Sam Leffler
                       Copyright (c) 1991-1997 Silicon Graphics, Inc.
-                      See libtiff home page http://www.libtiff.org/
+                      See libtiff home page #1 http://www.simplesystems.org/libtiff/
+                                            #2 https://libtiff.gitlab.io/libtiff/
+                                            #3 http://libtiff.maptools.org/
                       for details and links to the source code
 
                       WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index caa8f83c99..bb87b483ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -463,9 +463,6 @@ OCV_OPTION(WITH_ANDROID_MEDIANDK "Use Android Media NDK for Video I/O (Android)"
 OCV_OPTION(WITH_ANDROID_NATIVE_CAMERA "Use Android NDK for Camera I/O (Android)" (ANDROID_NATIVE_API_LEVEL GREATER 23)
   VISIBLE_IF ANDROID
   VERIFY HAVE_ANDROID_NATIVE_CAMERA)
-OCV_OPTION(WITH_TENGINE "Include Arm Inference Tengine support" OFF
-  VISIBLE_IF (ARM OR AARCH64) AND (UNIX OR ANDROID) AND NOT IOS
-  VERIFY HAVE_TENGINE)
 OCV_OPTION(WITH_ONNX "Include Microsoft ONNX Runtime support" OFF
   VISIBLE_IF TRUE
   VERIFY HAVE_ONNX)
@@ -768,9 +765,6 @@ if(WITH_LAPACK)
 endif()
 include(cmake/OpenCVFindProtobuf.cmake)
 include(cmake/OpenCVDetectFlatbuffers.cmake)
-if(WITH_TENGINE)
-  include(cmake/OpenCVFindTengine.cmake)
-endif()
 if(WITH_TIMVX)
   include(cmake/OpenCVFindTIMVX.cmake)
 endif()
@@ -1623,10 +1617,6 @@ if(WITH_VA OR HAVE_VA)
   status("    VA:"            HAVE_VA          THEN "YES" ELSE NO)
 endif()
 
-if(WITH_TENGINE OR HAVE_TENGINE)
-  status("    Tengine:"      HAVE_TENGINE     THEN "YES (${TENGINE_LIBRARIES})" ELSE NO)
-endif()
-
 if(WITH_LAPACK OR HAVE_LAPACK)
   status("    Lapack:"      HAVE_LAPACK     THEN "YES (${LAPACK_LIBRARIES} ${LAPACK_VERSION})" ELSE NO)
 endif()
@@ -1693,6 +1683,10 @@ else()
   endif()
 endif()
 
+if(BUILD_opencv_dnn AND OPENCV_DNN_BACKEND_DEFAULT)
+    status("    Default DNN backend:" ${OPENCV_DNN_BACKEND_DEFAULT})
+endif()
+
 if(WITH_EIGEN OR HAVE_EIGEN)
   status("    Eigen:"      HAVE_EIGEN       THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO)
 endif()
diff --git a/apps/visualisation/opencv_visualisation.cpp b/apps/visualisation/opencv_visualisation.cpp
index 85e9697aad..9b7fcd9f48 100644
--- a/apps/visualisation/opencv_visualisation.cpp
+++ b/apps/visualisation/opencv_visualisation.cpp
@@ -60,6 +60,7 @@ Created by: Puttemans Steven - April 2016
 
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
 using namespace std;
 using namespace cv;
diff --git a/cmake/OpenCVDetectPython.cmake b/cmake/OpenCVDetectPython.cmake
index 6e95fbf3a0..2c92e33eb6 100644
--- a/cmake/OpenCVDetectPython.cmake
+++ b/cmake/OpenCVDetectPython.cmake
@@ -251,7 +251,7 @@ if(NOT ${found})
   set(${include_path} "${_include_path}" CACHE INTERNAL "")
   set(${include_dir} "${_include_dir}" CACHE PATH "Python include dir")
   set(${include_dir2} "${_include_dir2}" CACHE PATH "Python include dir 2")
-  set(${packages_path} "${_packages_path}" CACHE PATH "Where to install the python packages.")
+  set(${packages_path} "${_packages_path}" CACHE STRING "Where to install the python packages.")
   set(${numpy_include_dirs} ${_numpy_include_dirs} CACHE PATH "Path to numpy headers")
   set(${numpy_version} "${_numpy_version}" CACHE INTERNAL "")
 endif()
diff --git a/cmake/OpenCVFindTengine.cmake b/cmake/OpenCVFindTengine.cmake
deleted file mode 100644
index 2d33f5c993..0000000000
--- a/cmake/OpenCVFindTengine.cmake
+++ /dev/null
@@ -1,78 +0,0 @@
-# COPYRIGHT
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# License); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Copyright (c) 2020, OPEN AI LAB
-# Author: qtang@openailab.com or https://github.com/BUG1989
-#
-
-# ----------------------------------------------------------------------------
-#  Path for Tengine binaries
-# ----------------------------------------------------------------------------
-set(OPENCV_LIBTENGINE_ROOT_DIR "" CACHE PATH "Path to TENGINE binaries installation")
-
-IF(OPENCV_LIBTENGINE_ROOT_DIR AND NOT BUILD_TENGINE)
-
-	MESSAGE(STATUS "TENGINE:--  Use binaries at ${OPENCV_LIBTENGINE_ROOT_DIR}")
-
-	SET(Tengine_FOUND ON)
-	set(BUILD_TENGINE OFF)
-
-	SET(Tengine_INCLUDE_DIR "${OPENCV_LIBTENGINE_ROOT_DIR}/include" CACHE PATH "TENGINE include dir")
-	SET(Tengine_LIB "${OPENCV_LIBTENGINE_ROOT_DIR}/lib/libtengine.a" CACHE PATH "TENGINE library dir")
-
-ELSE()
-	IF(ANDROID)
-		IF(OPENCV_TENGINE_FORCE_ANDROID)
-			# nothing, use Android
-		ELSEIF(OPENCV_TENGINE_SKIP_ANDROID)
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ELSEIF(NOT DEFINED ANDROID_NDK_REVISION)
-			MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION is not defined")
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ELSEIF(ANDROID_NDK_REVISION VERSION_LESS 14)
-			MESSAGE(STATUS "Android NDK version Tengine not support: ANDROID_NDK_REVISION=${ANDROID_NDK_REVISION}")
-			set(Tengine_FOUND OFF)
-			set(HAVE_TENGINE FALSE)
-			return()
-		ENDIF()
-	ENDIF()
-	MESSAGE(STATUS "TENGINE:--  Build Tengine from source code. ")
-	include("${OpenCV_SOURCE_DIR}/3rdparty/libtengine/tengine.cmake")
-ENDIF()
-
-IF(NOT Tengine_LIB)
-	SET(Tengine_FOUND OFF)
-	MESSAGE(STATUS "#### Could not find Tengine lib. Turning Tengine_FOUND off")
-ENDIF()
-
-IF (Tengine_FOUND)
-	MESSAGE(STATUS "Found Tengine include: ${Tengine_INCLUDE_DIR}")
-	MESSAGE(STATUS "Found Tengine libraries: ${Tengine_LIB}")
-	set(HAVE_TENGINE 1)
-	set(TENGINE_LIBRARIES    ${Tengine_LIB})
-	set(TENGINE_INCLUDE_DIRS    ${Tengine_INCLUDE_DIR})
-ENDIF (Tengine_FOUND)
-
-MARK_AS_ADVANCED(
-	Tengine_INCLUDE_DIR
-	Tengine_LIB
-)
diff --git a/cmake/mirrors/custom.cmake b/cmake/mirrors/custom.cmake
index 3cdf700e19..8c421471f3 100644
--- a/cmake/mirrors/custom.cmake
+++ b/cmake/mirrors/custom.cmake
@@ -1,15 +1,12 @@
 # Gitlab-style mirror
 # CMake scripts look for opencv/opencv_3rdparty,
-#  OAID/Tengine, 01org/tbb(oneAPI/oneTBB), opencv/ade
+#  01org/tbb(oneAPI/oneTBB), opencv/ade
 #  from OPENCV_DOWNLOAD_MIRROR
 ocv_update(OPENCV_DOWNLOAD_MIRROR_URL "")
 
 ######
 # Download via commit id
 ######
-# Tengine
-ocv_update(TENGINE_PKG_MD5_CUSTOM "")
-ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
 # NVIDIA_OPTICAL_FLOW
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE "")
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@@ -77,7 +74,7 @@ else()
     ocv_download_url_custom_usercontent(opencv)
   elseif(DL_ID STREQUAL "wechat_qrcode")
     ocv_download_url_gitcode_usercontent(WeChatCV)
-  elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
+  elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
     ocv_download_url_custom_archive_commit_id()
   elseif(DL_ID STREQUAL "TBB")
     ocv_download_url_custom_archive_release()
diff --git a/cmake/mirrors/gitcode.cmake b/cmake/mirrors/gitcode.cmake
index c9d41e7458..e208a87245 100644
--- a/cmake/mirrors/gitcode.cmake
+++ b/cmake/mirrors/gitcode.cmake
@@ -1,9 +1,6 @@
 ######
 # Download via commit id
 ######
-# Tengine
-ocv_update(TENGINE_PKG_MD5_GITCODE 1b5908632b557275cd6e85b0c03f9690)
-ocv_update(TENGINE_PKG_MD5_ORIGINAL 23f61ebb1dd419f1207d8876496289c5) # same as tengine_md5sum for TENGINE commit of e89cf8870de2ff0a80cfe626c0b52b2a16fb302e
 # NVIDIA_OPTICAL_FLOW
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_GITCODE 8d5b7eeb24d6ca9c6bcfdff4196d5b47)
 ocv_update(NVIDIA_OPTICAL_FLOW_PKG_MD5_ORIGINAL a73cd48b18dcc0cc8933b30796074191)
@@ -74,7 +71,7 @@ if((DL_ID STREQUAL "FFMPEG") OR (DL_ID STREQUAL "IPPICV") OR (DL_ID STREQUAL "da
   ocv_download_url_gitcode_usercontent(opencv)
 elseif(DL_ID STREQUAL "wechat_qrcode")
   ocv_download_url_gitcode_usercontent(mirrors/WeChatCV)
-elseif((DL_ID STREQUAL "TENGINE") OR (DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
+elseif((DL_ID STREQUAL "NVIDIA_OPTICAL_FLOW") OR (DL_ID STREQUAL "TIM-VX"))
   ocv_download_url_gitcode_archive_commit_id()
 elseif(DL_ID STREQUAL "TBB")
   ocv_download_url_gitcode_archive_release(OPENCV_TBB_SUBDIR)
diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 16acc315f5..84201f06c0 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -224,6 +224,16 @@ Following options can be used to produce special builds with instrumentation or
 @see [Link time optimization](https://gcc.gnu.org/wiki/LinkTimeOptimization)
 @see [ThinLTO](https://clang.llvm.org/docs/ThinLTO.html)
 
+## Enable IPP optimization
+
+The following options enable IPP optimizations for individual functions, at the cost of increasing the size of the OpenCV library. All options are disabled by default.
+
+| Option | Functions | Approximate size increase |
+| -------| --------- | -------------- |
+| `OPENCV_IPP_GAUSSIAN_BLUR` | GaussianBlur() | +8Mb |
+| `OPENCV_IPP_MEAN` | mean() / meanStdDev() | +0.2Mb |
+| `OPENCV_IPP_MINMAX` | minMaxLoc() / minMaxIdx() | +0.2Mb |
+| `OPENCV_IPP_SUM` | sum() | +0.1Mb |
 
 # Functional features and dependencies {#tutorial_config_reference_func}
 
@@ -484,7 +494,6 @@ OpenCV have own DNN inference module which have own build-in engine, but can als
 | `OPENCV_DNN_CUDA` | _OFF_ | Enable CUDA backend. [CUDA](https://en.wikipedia.org/wiki/CUDA), CUBLAS and [CUDNN](https://developer.nvidia.com/cudnn) must be installed. |
 | `WITH_HALIDE` | _OFF_ | Use experimental [Halide](https://en.wikipedia.org/wiki/Halide_(programming_language)) backend which can generate optimized code for dnn-layers at runtime. Halide must be installed. |
 | `WITH_VULKAN` | _OFF_ | Enable experimental [Vulkan](https://en.wikipedia.org/wiki/Vulkan_(API)) backend. Does not require additional dependencies, but can use external Vulkan headers (`VULKAN_INCLUDE_DIRS`). |
-| `WITH_TENGINE` | _OFF_ | Enable experimental [Tengine](https://github.com/OAID/Tengine) backend for ARM CPUs. Tengine library must be installed. |
 
 
 # Installation layout {#tutorial_config_reference_install}
@@ -566,6 +575,7 @@ Following options can be used to change installation layout for common scenarios
 | ------ | ------- | ----------- |
 | `OPENCV_ENABLE_NONFREE` | _OFF_ | Some algorithms included in the library are known to be protected by patents and are disabled by default. |
 | `OPENCV_FORCE_3RDPARTY_BUILD`| _OFF_ | Enable all `BUILD_` options at once. |
+| `OPENCV_IPP_ENABLE_ALL`| _OFF_ | Enable all `OPENCV_IPP_` options at once. |
 | `ENABLE_CCACHE` | _ON_ (on Unix-like platforms) | Enable [ccache](https://en.wikipedia.org/wiki/Ccache) auto-detection. This tool wraps compiler calls and caches results, can significantly improve re-compilation time. |
 | `ENABLE_PRECOMPILED_HEADERS` | _ON_ (for MSVC) | Enable precompiled headers support. Improves build time. |
 | `BUILD_DOCS` | _OFF_ | Enable documentation build (_doxygen_, _doxygen_cpp_, _doxygen_python_, _doxygen_javadoc_ targets). [Doxygen](http://www.doxygen.org/index.html) must be installed for C++ documentation build. Python and [BeautifulSoup4](https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)) must be installed for Python documentation build. Javadoc and Ant must be installed for Java documentation build (part of Java SDK). |
diff --git a/modules/3d/src/ap3p.cpp b/modules/3d/src/ap3p.cpp
index 34ac9c4323..79da0f13a7 100644
--- a/modules/3d/src/ap3p.cpp
+++ b/modules/3d/src/ap3p.cpp
@@ -1,5 +1,6 @@
 #include "precomp.hpp"
 #include "ap3p.h"
+#include "polynom_solver.h"
 
 #include <cmath>
 #include <complex>
@@ -7,67 +8,11 @@
 static inline double cbrt(double x) { return (double)cv::cubeRoot((float)x); };
 #endif
 
-namespace cv {
-
-static
-void solveQuartic(const double *factors, double *realRoots)
-{
-    const double &a4 = factors[0];
-    const double &a3 = factors[1];
-    const double &a2 = factors[2];
-    const double &a1 = factors[3];
-    const double &a0 = factors[4];
-
-    double a4_2 = a4 * a4;
-    double a3_2 = a3 * a3;
-    double a4_3 = a4_2 * a4;
-    double a2a4 = a2 * a4;
-
-    double p4 = (8 * a2a4 - 3 * a3_2) / (8 * a4_2);
-    double q4 = (a3_2 * a3 - 4 * a2a4 * a3 + 8 * a1 * a4_2) / (8 * a4_3);
-    double r4 = (256 * a0 * a4_3 - 3 * (a3_2 * a3_2) - 64 * a1 * a3 * a4_2 + 16 * a2a4 * a3_2) / (256 * (a4_3 * a4));
-
-    double p3 = ((p4 * p4) / 12 + r4) / 3; // /=-3
-    double q3 = (72 * r4 * p4 - 2 * p4 * p4 * p4 - 27 * q4 * q4) / 432; // /=2
-
-    double t; // *=2
-    std::complex<double> w;
-    if (q3 >= 0)
-        w = -std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
-    else
-        w = std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
-    if (w.imag() == 0.0) {
-        w.real(std::cbrt(w.real()));
-        t = 2.0 * (w.real() + p3 / w.real());
-    } else {
-        w = pow(w, 1.0 / 3);
-        t = 4.0 * w.real();
-    }
-
-    std::complex<double> sqrt_2m = sqrt(static_cast<std::complex<double> >(-2 * p4 / 3 + t));
-    double B_4A = -a3 / (4 * a4);
-    double complex1 = 4 * p4 / 3 + t;
-#if defined(__clang__) && defined(__arm__) && (__clang_major__ == 3 || __clang_major__ == 4) && !defined(__ANDROID__)
-    // details: https://github.com/opencv/opencv/issues/11135
-    // details: https://github.com/opencv/opencv/issues/11056
-    std::complex<double> complex2 = 2 * q4;
-    complex2 = std::complex<double>(complex2.real() / sqrt_2m.real(), 0);
-#else
-    std::complex<double> complex2 = 2 * q4 / sqrt_2m;
-#endif
-    double sqrt_2m_rh = sqrt_2m.real() / 2;
-    double sqrt1 = sqrt(-(complex1 + complex2)).real() / 2;
-    realRoots[0] = B_4A + sqrt_2m_rh + sqrt1;
-    realRoots[1] = B_4A + sqrt_2m_rh - sqrt1;
-    double sqrt2 = sqrt(-(complex1 - complex2)).real() / 2;
-    realRoots[2] = B_4A - sqrt_2m_rh + sqrt2;
-    realRoots[3] = B_4A - sqrt_2m_rh - sqrt2;
-}
-
-static void polishQuarticRoots(const double *coeffs, double *roots) {
+namespace {
+void polishQuarticRoots(const double *coeffs, double *roots, int nb_roots) {
     const int iterations = 2;
     for (int i = 0; i < iterations; ++i) {
-        for (int j = 0; j < 4; ++j) {
+        for (int j = 0; j < nb_roots; ++j) {
             double error =
                     (((coeffs[0] * roots[j] + coeffs[1]) * roots[j] + coeffs[2]) * roots[j] + coeffs[3]) * roots[j] +
                     coeffs[4];
@@ -124,7 +69,9 @@ inline void mat_mult(const double a[3][3], const double b[3][3], double result[3
     result[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1];
     result[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2];
 }
+}
 
+namespace cv {
 void ap3p::init_inverse_parameters() {
     inv_fx = 1. / fx;
     inv_fy = 1. / fy;
@@ -228,8 +175,9 @@ int ap3p::computePoses(const double featureVectors[3][4],
                         2 * (g6 * g7 - g1 * g2 - g3 * g4),
                         g7 * g7 - g2 * g2 - g4 * g4};
     double s[4];
-    solveQuartic(coeffs, s);
-    polishQuarticRoots(coeffs, s);
+    int nb_roots = solve_deg4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4],
+                              s[0], s[1], s[2], s[3]);
+    polishQuarticRoots(coeffs, s, nb_roots);
 
     double temp[3];
     vect_cross(k1, nl, temp);
@@ -255,7 +203,7 @@ int ap3p::computePoses(const double featureVectors[3][4],
     double reproj_errors[4];
 
     int nb_solutions = 0;
-    for (int i = 0; i < 4; ++i) {
+    for (int i = 0; i < nb_roots; ++i) {
         double ctheta1p = s[i];
         if (abs(ctheta1p) > 1)
             continue;
diff --git a/modules/3d/src/usac/essential_solver.cpp b/modules/3d/src/usac/essential_solver.cpp
index 504fec6ab5..434db6d373 100644
--- a/modules/3d/src/usac/essential_solver.cpp
+++ b/modules/3d/src/usac/essential_solver.cpp
@@ -239,7 +239,8 @@ public:
             // (5) Compute the left eigenvectors of the action matrix
             Eigen::EigenSolver<Eigen::Matrix<double, 10, 10>> eigensolver(action_mat_eig);
             const Eigen::VectorXcd &eigenvalues = eigensolver.eigenvalues();
-            const auto * const eig_vecs_ = (double *) eigensolver.eigenvectors().real().data();
+            const Eigen::MatrixXcd eigenvectors = eigensolver.eigenvectors();
+            const auto * const eig_vecs_ = (double *) eigenvectors.data();
 #else
             Matx<double, 10, 10> A = constraint_mat.colRange(0, 10),
                              B = constraint_mat.colRange(10, 20), eliminated_mat;
diff --git a/modules/3d/test/test_affine2d_estimator.cpp b/modules/3d/test/test_affine2d_estimator.cpp
index 95f1235105..2282dc3240 100644
--- a/modules/3d/test/test_affine2d_estimator.cpp
+++ b/modules/3d/test/test_affine2d_estimator.cpp
@@ -115,8 +115,8 @@ TEST_P(EstimateAffine2D, testNPoints)
 
         EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
 
-        bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
-            m == accumulate(inliers.begin(), inliers.begin() + m, 0);
+        bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
+            m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
 
         EXPECT_TRUE(inliers_good);
     }
diff --git a/modules/3d/test/test_affine3d_estimator.cpp b/modules/3d/test/test_affine3d_estimator.cpp
index f5a118da5d..c355605385 100644
--- a/modules/3d/test/test_affine3d_estimator.cpp
+++ b/modules/3d/test/test_affine3d_estimator.cpp
@@ -161,8 +161,8 @@ bool CV_Affine3D_EstTest::testNPoints()
         return false;
     }
 
-    bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
-        m == accumulate(outl.begin(), outl.begin() + m, 0);
+    bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
+        m == std::accumulate(outl.begin(), outl.begin() + m, 0);
 
     if (!outl_good)
     {
diff --git a/modules/3d/test/test_affine_partial2d_estimator.cpp b/modules/3d/test/test_affine_partial2d_estimator.cpp
index 0be25ee7eb..dbbb4da0d9 100644
--- a/modules/3d/test/test_affine_partial2d_estimator.cpp
+++ b/modules/3d/test/test_affine_partial2d_estimator.cpp
@@ -125,8 +125,8 @@ TEST_P(EstimateAffinePartial2D, testNPoints)
 
         EXPECT_NEAR(0., cvtest::norm(aff_est, aff, NORM_INF), 1e-4);
 
-        bool inliers_good = count(inliers.begin(), inliers.end(), 1) == m &&
-            m == accumulate(inliers.begin(), inliers.begin() + m, 0);
+        bool inliers_good = std::count(inliers.begin(), inliers.end(), 1) == m &&
+            m == std::accumulate(inliers.begin(), inliers.begin() + m, 0);
 
         EXPECT_TRUE(inliers_good);
     }
diff --git a/modules/3d/test/test_solvepnp_ransac.cpp b/modules/3d/test/test_solvepnp_ransac.cpp
index be6f1342a7..b22dff15a9 100644
--- a/modules/3d/test/test_solvepnp_ransac.cpp
+++ b/modules/3d/test/test_solvepnp_ransac.cpp
@@ -41,6 +41,7 @@
 //M*/
 
 #include "test_precomp.hpp"
+#include "opencv2/core/utils/logger.hpp"
 
 namespace opencv_test { namespace {
 
@@ -2259,4 +2260,65 @@ TEST(Calib3d_SolvePnP, inputShape)
     }
 }
 
+bool hasNan(const cv::Mat& mat)
+{
+    bool has = false;
+    if (mat.type() == CV_32F)
+    {
+        for(int i = 0; i < static_cast<int>(mat.total()); i++)
+            has |= cvIsNaN(mat.at<float>(i)) != 0;
+    }
+    else if (mat.type() == CV_64F)
+    {
+        for(int i = 0; i < static_cast<int>(mat.total()); i++)
+            has |= cvIsNaN(mat.at<double>(i)) != 0;
+    }
+    else
+    {
+        has = true;
+        CV_LOG_ERROR(NULL, "check hasNan called with unsupported type!");
+    }
+
+    return has;
+}
+
+TEST(AP3P, ctheta1p_nan_23607)
+{
+    // the task is not well defined and may not converge (empty R, t) or should
+    // converge to some non-NaN solution
+    const std::array<cv::Point2d, 3> cameraPts = {
+        cv::Point2d{0.042784865945577621, 0.59844839572906494},
+        cv::Point2d{-0.028428621590137482, 0.60354739427566528},
+        cv::Point2d{0.0046037044376134872, 0.70674681663513184}
+    };
+    const std::array<cv::Point3d, 3> modelPts = {
+        cv::Point3d{-0.043258000165224075, 0.020459245890378952, -0.0069921980611979961},
+        cv::Point3d{-0.045648999512195587, 0.0029820732306689024, 0.0079000638797879219},
+        cv::Point3d{-0.043276999145746231, -0.013622495345771313, 0.0080113131552934647}
+    };
+
+    std::vector<Mat> R, t;
+    solveP3P(modelPts, cameraPts, Mat::eye(3, 3, CV_64F), Mat(), R, t, SOLVEPNP_AP3P);
+
+    EXPECT_EQ(R.size(), 2ul);
+    EXPECT_EQ(t.size(), 2ul);
+
+    // Apply each estimated rotation and translation to the model points and check that they project onto the camera points.
+    Mat pts = Mat(modelPts).reshape(1, 3);
+    Mat expected = Mat(cameraPts).reshape(1, 3);
+    for (size_t i = 0; i < R.size(); ++i) {
+        EXPECT_TRUE(!hasNan(R[i]));
+        EXPECT_TRUE(!hasNan(t[i]));
+
+        Mat transform;
+        cv::Rodrigues(R[i], transform);
+        Mat res = pts * transform.t();
+        for (int j = 0; j < 3; ++j) {
+            res.row(j) += t[i].reshape(1, 1);
+            res.row(j) /= res.row(j).at<double>(2);
+        }
+        EXPECT_LE(cvtest::norm(res.colRange(0, 2), expected, NORM_INF), 3e-16);
+    }
+}
+
 }} // namespace
diff --git a/modules/3d/test/test_translation3d_estimator.cpp b/modules/3d/test/test_translation3d_estimator.cpp
index 88ad40e0f8..97c20e5033 100644
--- a/modules/3d/test/test_translation3d_estimator.cpp
+++ b/modules/3d/test/test_translation3d_estimator.cpp
@@ -91,8 +91,8 @@ TEST(Calib3d_EstimateTranslation3D, testNPoints)
         << "aff est: " << trans_est << endl
         << "aff ref: " << trans;
 
-    bool outl_good = count(outl.begin(), outl.end(), 1) == m &&
-        m == accumulate(outl.begin(), outl.begin() + m, 0);
+    bool outl_good = std::count(outl.begin(), outl.end(), 1) == m &&
+        m == std::accumulate(outl.begin(), outl.begin() + m, 0);
 
     EXPECT_TRUE(outl_good);
 }
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 1b3f574275..ba5b61ef5f 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -60,6 +60,26 @@ if(CV_TRACE AND HAVE_ITT)
   add_definitions(-DOPENCV_WITH_ITT=1)
 endif()
 
+# https://github.com/opencv/opencv/issues/24145
+if(HAVE_IPP)
+  OCV_OPTION(OPENCV_IPP_ENABLE_ALL "Enable all OPENCV_IPP_ options at once" OFF)
+  OCV_OPTION(OPENCV_IPP_MEAN   "Enable IPP optimizations for mean (+200Kb in binary size)"                OPENCV_IPP_ENABLE_ALL)
+  OCV_OPTION(OPENCV_IPP_MINMAX "Enable IPP optimizations for minMaxLoc/minMaxIdx (+200Kb in binary size)" OPENCV_IPP_ENABLE_ALL)
+  OCV_OPTION(OPENCV_IPP_SUM    "Enable IPP optimizations for sum (+100Kb in binary size)"                 OPENCV_IPP_ENABLE_ALL)
+
+  if(OPENCV_IPP_MEAN)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/mean.dispatch.cpp "OPENCV_IPP_MEAN=1")
+  endif()
+
+  if(OPENCV_IPP_MINMAX)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/minmax.cpp "OPENCV_IPP_MINMAX=1")
+  endif()
+
+  if(OPENCV_IPP_SUM)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/sum.dispatch.cpp "OPENCV_IPP_SUM=1")
+  endif()
+endif()
+
 file(GLOB lib_cuda_hdrs
     "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.hpp"
     "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cuda/*.h")
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 96cf00a50d..89046d7907 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -1118,6 +1118,13 @@ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode);
  */
 CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis);
 
+/** @brief Broadcast the given Mat to the given shape.
+ * @param src input array
+ * @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported.
+ * @param dst output array that has the given shape
+ */
+CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst);
+
 enum RotateFlags {
     ROTATE_90_CLOCKWISE = 0, //!<Rotate 90 degrees clockwise
     ROTATE_180 = 1, //!<Rotate 180 degrees clockwise
diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 5dca06df98..9d210ed7b5 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -198,16 +198,32 @@ public:
     CV_WRAP GpuMat clone() const;
 
     //! copies the GpuMat content to device memory (Blocking call)
-    CV_WRAP void copyTo(OutputArray dst) const;
+    void copyTo(OutputArray dst) const;
+    //! bindings overload which copies the GpuMat content to device memory (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst) const {
+        copyTo(static_cast<OutputArray>(dst));
+    }
 
     //! copies the GpuMat content to device memory (Non-Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, Stream& stream) const;
+    void copyTo(OutputArray dst, Stream& stream) const;
+    //! bindings overload which copies the GpuMat content to device memory (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), stream);
+    }
 
     //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, InputArray mask) const;
+    void copyTo(OutputArray dst, InputArray mask) const;
+    //! bindings overload which copies those GpuMat elements to "dst" that are marked with non-zero mask elements (Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask));
+    }
 
     //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call)
-    CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+    void copyTo(OutputArray dst, InputArray mask, Stream& stream) const;
+    //! bindings overload which copies those GpuMat elements to "dst" that are marked with non-zero mask elements (Non-Blocking call)
+    CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const {
+        copyTo(static_cast<OutputArray>(dst), static_cast<InputArray>(mask), stream);
+    }
 
     //! sets some of the GpuMat elements to s (Blocking call)
     CV_WRAP GpuMat& setTo(Scalar s);
@@ -222,19 +238,31 @@ public:
     CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream);
 
     //! converts GpuMat to another datatype (Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype) const;
+    void convertTo(OutputArray dst, int rtype) const;
 
     //! converts GpuMat to another datatype (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, Stream& stream) const;
+    //! bindings overload taking a concrete GpuMat destination; converts GpuMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, stream); // the cast selects the OutputArray overload instead of re-entering this one
+    }
 
     //! converts GpuMat to another datatype with scaling (Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const;
+    //! bindings overload which converts GpuMat to another datatype with scaling (Blocking call); alpha defaults to 1.0 and beta to 0.0, so it can also be called as (dst, rtype)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta); // the cast selects the OutputArray overload instead of re-entering this one
+    }
 
     //! converts GpuMat to another datatype with scaling (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const;
 
     //! converts GpuMat to another datatype with scaling (Non-Blocking call)
-    CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+    void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const;
+    //! bindings overload taking a concrete GpuMat destination; converts GpuMat to another datatype with scaling (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const {
+        convertTo(static_cast<OutputArray>(dst), rtype, alpha, beta, stream); // the cast selects the OutputArray overload instead of re-entering this one
+    }
 
     CV_WRAP void assignTo(GpuMat& m, int type = -1) const;
 
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 5f8c9afbe3..2ae64ca8e4 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -2014,12 +2014,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a)
 inline v_int32x4 v_round(const v_float64x2& a)
 {
     static const int32x2_t zero = vdup_n_s32(0);
-    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero));
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero)); // vcvtnq: round to nearest, ties to even (vcvtaq rounded ties away from zero); the two upper lanes are filled with zero
 }
 
 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
-    return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val))));
+    return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val)))); // vcvtnq: round to nearest, ties to even; lanes 0-1 come from a, lanes 2-3 from b
 }
 
 inline v_int32x4 v_floor(const v_float64x2& a)
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
index dab82489f8..6c28b44f5b 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp
@@ -924,6 +924,9 @@ inline scalartype v_reduce_sum(const _Tpvec& a)  \
     return (scalartype)v_get0(res); \
 }
 OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, double, f64, VTraits<v_float64>::vlanes()) // scalartype is double so the 64-bit sum is not narrowed to float on return
+#endif
 
 #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
 inline scalartype v_reduce_##func(const _Tpvec& a)  \
diff --git a/modules/core/include/opencv2/core/opencl/opencl_info.hpp b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
index 3ead76e5c4..0f0de893ca 100644
--- a/modules/core/include/opencv2/core/opencl/opencl_info.hpp
+++ b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include <iostream>
+#include <sstream>
 
 #include <opencv2/core.hpp>
 #include <opencv2/core/ocl.hpp>
diff --git a/modules/core/include/opencv2/core/utils/filesystem.private.hpp b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
index c32be15c61..70df64f0d4 100644
--- a/modules/core/include/opencv2/core/utils/filesystem.private.hpp
+++ b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
@@ -12,7 +12,8 @@
 #  elif defined WINRT || defined _WIN32_WCE
      /* not supported */
 #  elif defined __ANDROID__ || defined __linux__ || defined _WIN32 || \
-        defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__
+        defined __FreeBSD__ || defined __bsdi__ || defined __HAIKU__ || \
+        defined __GNU__
 #      define OPENCV_HAVE_FILESYSTEM_SUPPORT 1
 #  elif defined(__APPLE__)
 #    include <TargetConditionals.h>
diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index 3ac9a24639..872963fc65 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -5,8 +5,35 @@ namespace opencv_test
 {
 using namespace perf;
 
+using BroadcastTest = perf::TestBaseWithParam<std::tuple<std::vector<int>, perf::MatType, std::vector<int>>>;
 typedef Size_MatType BinaryOpTest;
 
+PERF_TEST_P_(BroadcastTest, basic)
+{
+    // Test parameters: (source shape, element type, target shape).
+    const auto params = GetParam();
+    std::vector<int> srcShape = get<0>(params), dstShape = get<2>(params);
+    const int matType = get<1>(params);
+    const int srcDims = static_cast<int>(srcShape.size());
+    const int dstDims = static_cast<int>(dstShape.size());
+
+    cv::Mat src(srcDims, srcShape.data(), matType), dst(dstDims, dstShape.data(), matType);
+    cv::randu(src, -1.f, 1.f);
+    // Only the broadcast call is timed; results are not compared against a reference.
+    TEST_CYCLE() cv::broadcast(src, dstShape, dst);
+    SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , BroadcastTest,
+    testing::Combine(
+        testing::Values(std::vector<int>{1, 100, 800},   // source shape: axis 0 is expanded (1 -> 10)
+                        std::vector<int>{10, 1, 800},    // source shape: axis 1 is expanded (1 -> 100)
+                        std::vector<int>{10, 100, 1}),   // source shape: axis 2 is expanded (1 -> 800)
+        testing::Values(CV_32FC1),                       // element type used for both src and dst
+        testing::Values(std::vector<int>{10, 100, 800})  // target shape shared by all cases
+    )
+);
+
 PERF_TEST_P_(BinaryOpTest, min)
 {
     Size sz = get<0>(GetParam());
diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp
index 67cc051e0b..9f85ea5f04 100644
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -1335,7 +1335,7 @@ struct InRange_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct InRange_SIMD<uchar>
@@ -1344,7 +1344,7 @@ struct InRange_SIMD<uchar>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_uint8::nlanes;
+        const int width = VTraits<v_uint8>::vlanes();
 
         for (; x <= len - width; x += width)
         {
@@ -1352,7 +1352,7 @@ struct InRange_SIMD<uchar>
             v_uint8 low = vx_load(src2 + x);
             v_uint8 high = vx_load(src3 + x);
 
-            v_store(dst + x, (values >= low) & (high >= values));
+            v_store(dst + x, v_and(v_ge(values, low), v_ge(high, values)));
         }
         vx_cleanup();
         return x;
@@ -1366,7 +1366,7 @@ struct InRange_SIMD<schar>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_int8::nlanes;
+        const int width = VTraits<v_int8>::vlanes();
 
         for (; x <= len - width; x += width)
         {
@@ -1374,7 +1374,7 @@ struct InRange_SIMD<schar>
             v_int8 low = vx_load(src2 + x);
             v_int8 high = vx_load(src3 + x);
 
-            v_store((schar*)(dst + x), (values >= low) & (high >= values));
+            v_store((schar*)(dst + x), v_and(v_ge(values, low), v_ge(high, values)));
         }
         vx_cleanup();
         return x;
@@ -1388,7 +1388,7 @@ struct InRange_SIMD<ushort>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = v_uint16::nlanes * 2;
+        const int width = VTraits<v_uint16>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1396,11 +1396,11 @@ struct InRange_SIMD<ushort>
             v_uint16 low1 = vx_load(src2 + x);
             v_uint16 high1 = vx_load(src3 + x);
 
-            v_uint16 values2 = vx_load(src1 + x + v_uint16::nlanes);
-            v_uint16 low2 = vx_load(src2 + x + v_uint16::nlanes);
-            v_uint16 high2 = vx_load(src3 + x + v_uint16::nlanes);
+            v_uint16 values2 = vx_load(src1 + x + VTraits<v_uint16>::vlanes());
+            v_uint16 low2 = vx_load(src2 + x + VTraits<v_uint16>::vlanes());
+            v_uint16 high2 = vx_load(src3 + x + VTraits<v_uint16>::vlanes());
 
-            v_store(dst + x, v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
+            v_store(dst + x, v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
         }
         vx_cleanup();
         return x;
@@ -1414,7 +1414,7 @@ struct InRange_SIMD<short>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_int16::nlanes * 2;
+        const int width = (int)VTraits<v_int16>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1422,11 +1422,11 @@ struct InRange_SIMD<short>
             v_int16 low1 = vx_load(src2 + x);
             v_int16 high1 = vx_load(src3 + x);
 
-            v_int16 values2 = vx_load(src1 + x + v_int16::nlanes);
-            v_int16 low2 = vx_load(src2 + x + v_int16::nlanes);
-            v_int16 high2 = vx_load(src3 + x + v_int16::nlanes);
+            v_int16 values2 = vx_load(src1 + x + VTraits<v_int16>::vlanes());
+            v_int16 low2 = vx_load(src2 + x + VTraits<v_int16>::vlanes());
+            v_int16 high2 = vx_load(src3 + x + VTraits<v_int16>::vlanes());
 
-            v_store((schar*)(dst + x), v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2)));
+            v_store((schar*)(dst + x), v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2))));
         }
         vx_cleanup();
         return x;
@@ -1440,7 +1440,7 @@ struct InRange_SIMD<int>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_int32::nlanes * 2;
+        const int width = (int)VTraits<v_int32>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1448,11 +1448,11 @@ struct InRange_SIMD<int>
             v_int32 low1 = vx_load(src2 + x);
             v_int32 high1 = vx_load(src3 + x);
 
-            v_int32 values2 = vx_load(src1 + x + v_int32::nlanes);
-            v_int32 low2 = vx_load(src2 + x + v_int32::nlanes);
-            v_int32 high2 = vx_load(src3 + x + v_int32::nlanes);
+            v_int32 values2 = vx_load(src1 + x + VTraits<v_int32>::vlanes());
+            v_int32 low2 = vx_load(src2 + x + VTraits<v_int32>::vlanes());
+            v_int32 high2 = vx_load(src3 + x + VTraits<v_int32>::vlanes());
 
-            v_pack_store(dst + x, v_reinterpret_as_u16(v_pack((values1 >= low1) & (high1 >= values1), (values2 >= low2) & (high2 >= values2))));
+            v_pack_store(dst + x, v_reinterpret_as_u16(v_pack(v_and(v_ge(values1, low1), v_ge(high1, values1)), v_and(v_ge(values2, low2), v_ge(high2, values2)))));
         }
         vx_cleanup();
         return x;
@@ -1466,7 +1466,7 @@ struct InRange_SIMD<float>
         uchar * dst, int len) const
     {
         int x = 0;
-        const int width = (int)v_float32::nlanes * 2;
+        const int width = (int)VTraits<v_float32>::vlanes() * 2;
 
         for (; x <= len - width; x += width)
         {
@@ -1474,12 +1474,12 @@ struct InRange_SIMD<float>
             v_float32 low1 = vx_load(src2 + x);
             v_float32 high1 = vx_load(src3 + x);
 
-            v_float32 values2 = vx_load(src1 + x + v_float32::nlanes);
-            v_float32 low2 = vx_load(src2 + x + v_float32::nlanes);
-            v_float32 high2 = vx_load(src3 + x + v_float32::nlanes);
+            v_float32 values2 = vx_load(src1 + x + VTraits<v_float32>::vlanes());
+            v_float32 low2 = vx_load(src2 + x + VTraits<v_float32>::vlanes());
+            v_float32 high2 = vx_load(src3 + x + VTraits<v_float32>::vlanes());
 
-            v_pack_store(dst + x, v_pack(v_reinterpret_as_u32(values1 >= low1) & v_reinterpret_as_u32(high1 >= values1),
-                                         v_reinterpret_as_u32(values2 >= low2) & v_reinterpret_as_u32(high2 >= values2)));
+            v_pack_store(dst + x, v_pack(v_and(v_reinterpret_as_u32(v_ge(values1, low1)), v_reinterpret_as_u32(v_ge(high1, values1))),
+                                         v_and(v_reinterpret_as_u32(v_ge(values2, low2)), v_reinterpret_as_u32(v_ge(high2, values2)))));
         }
         vx_cleanup();
         return x;
diff --git a/modules/core/src/arithm.simd.hpp b/modules/core/src/arithm.simd.hpp
index 20e70e5392..7054b3e6b6 100644
--- a/modules/core/src/arithm.simd.hpp
+++ b/modules/core/src/arithm.simd.hpp
@@ -215,7 +215,7 @@ template<typename T1, typename Tvec>
 struct op_add
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a + b; }
+    { return v_add(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return c_add(a, b); }
 };
@@ -225,7 +225,7 @@ template<typename T1, typename Tvec>
 struct op_sub
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a - b; }
+    { return v_sub(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return c_sub(a, b); }
 };
@@ -262,7 +262,7 @@ struct op_absdiff
 template<>
 struct op_absdiff<schar, v_int8>
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_int8 r(const v_int8& a, const v_int8& b)
     { return v_absdiffs(a, b); }
 #endif
@@ -272,7 +272,7 @@ struct op_absdiff<schar, v_int8>
 template<>
 struct op_absdiff<short, v_int16>
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_int16 r(const v_int16& a, const v_int16& b)
     { return v_absdiffs(a, b); }
 #endif
@@ -282,7 +282,7 @@ struct op_absdiff<short, v_int16>
 template<>
 struct op_absdiff<int, v_int32>
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_int32 r(const v_int32& a, const v_int32& b)
     { return v_reinterpret_as_s32(v_absdiff(a, b)); }
 #endif
@@ -295,7 +295,7 @@ template<typename T1, typename Tvec>
 struct op_or
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a | b; }
+    { return v_or(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a | b; }
 };
@@ -303,7 +303,7 @@ template<typename T1, typename Tvec>
 struct op_xor
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a ^ b; }
+    { return v_xor(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a ^ b; }
 };
@@ -311,7 +311,7 @@ template<typename T1, typename Tvec>
 struct op_and
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a & b; }
+    { return v_and(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a & b; }
 };
@@ -320,14 +320,14 @@ struct op_not
 {
     // ignored b from loader level
     static inline Tvec r(const Tvec& a)
-    { return ~a; }
+    { return v_not(a); }
     static inline T1 r(T1 a, T1)
     { return ~a; }
 };
 
 //////////////////////////// Loaders /////////////////////////////////
 
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
 
 template< template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct bin_loader
@@ -392,13 +392,13 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, T1* dst, size_t step, int width, int height)
 {
     typedef OP<T1, Tvec> op;
-#if CV_SIMD
+#if CV_SIMD  || CV_SIMD_SCALABLE
     typedef bin_loader<OP, T1, Tvec> ldr;
-    enum {wide_step = Tvec::nlanes};
+    const int wide_step = VTraits<Tvec>::vlanes();
     #if !CV_NEON && CV_SIMD_WIDTH == 16
-        enum {wide_step_l = wide_step * 2};
+        const int wide_step_l = wide_step * 2;
     #else
-        enum {wide_step_l = wide_step};
+        const int wide_step_l = wide_step;
     #endif
 #endif // CV_SIMD
 
@@ -410,7 +410,7 @@ static void bin_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if CV_SIMD || CV_SIMD_SCALABLE
         #if !CV_NEON && !CV_MSA
         if (is_aligned(src1, src2, dst))
         {
@@ -583,7 +583,7 @@ template<typename T1, typename Tvec>
 struct op_cmplt
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a < b; }
+    { return v_lt(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a < b); }
 };
@@ -592,7 +592,7 @@ template<typename T1, typename Tvec>
 struct op_cmple
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a <= b; }
+    { return v_le(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a <= b); }
 };
@@ -601,7 +601,7 @@ template<typename T1, typename Tvec>
 struct op_cmpeq
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a == b; }
+    { return v_eq(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a == b); }
 };
@@ -610,14 +610,14 @@ template<typename T1, typename Tvec>
 struct op_cmpne
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a != b; }
+    { return v_ne(a, b); }
     static inline uchar r(T1 a, T1 b)
     { return (uchar)-(int)(a != b); }
 };
 
 //////////////////////////// Loaders /////////////////////////////////
 
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename Tvec> class OP, typename T1, typename Tvec>
 struct cmp_loader_n
@@ -642,10 +642,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(ushort), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         Tvec c0 = op::r(vx_load(src1), vx_load(src2));
         Tvec c1 = op::r(vx_load(src1 + step), vx_load(src2 + step));
         v_store(dst, v_pack_b(v_reinterpret_as_u16(c0), v_reinterpret_as_u16(c1)));
@@ -656,10 +656,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(unsigned), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         v_uint32 c0 = v_reinterpret_as_u32(op::r(vx_load(src1), vx_load(src2)));
         v_uint32 c1 = v_reinterpret_as_u32(op::r(vx_load(src1 + step), vx_load(src2 + step)));
         v_uint32 c2 = v_reinterpret_as_u32(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@@ -672,10 +672,10 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 struct cmp_loader_n<sizeof(double), OP, T1, Tvec>
 {
     typedef OP<T1, Tvec> op;
-    enum {step = Tvec::nlanes};
 
     static inline void l(const T1* src1, const T1* src2, uchar* dst)
     {
+        const int step = VTraits<Tvec>::vlanes();
         v_uint64 c0 = v_reinterpret_as_u64(op::r(vx_load(src1), vx_load(src2)));
         v_uint64 c1 = v_reinterpret_as_u64(op::r(vx_load(src1 + step), vx_load(src2 + step)));
         v_uint64 c2 = v_reinterpret_as_u64(op::r(vx_load(src1 + step * 2), vx_load(src2 + step * 2)));
@@ -697,9 +697,9 @@ template<template<typename T1, typename Tvec> class OP, typename T1, typename Tv
 static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2, uchar* dst, size_t step, int width, int height)
 {
     typedef OP<T1, Tvec> op;
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     typedef cmp_loader_n<sizeof(T1), OP, T1, Tvec> ldr;
-    enum {wide_step = Tvec::nlanes * sizeof(T1)};
+    const int wide_step = VTraits<Tvec>::vlanes() * sizeof(T1);
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -709,7 +709,7 @@ static void cmp_loop(const T1* src1, size_t step1, const T1* src2, size_t step2,
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if CV_SIMD || CV_SIMD_SCALABLE
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, src2 + x, dst + x);
@@ -876,7 +876,7 @@ DEFINE_SIMD_ALL(cmp)
 
 //////////////////////////// Loaders ///////////////////////////////
 
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
 // todo: add support for RW alignment & stream
 template<int nload, template<typename T1, typename T2, typename Tvec> class OP, typename T1, typename T2, typename Tvec>
 struct scalar_loader_n
@@ -1009,10 +1009,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
 struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
 {
     typedef OP<int, T2, v_int32> op;
-    enum {step = v_int32::nlanes};
 
     static inline void l(const int* src1, const int* src2, const T2* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src2 = vx_load(src2);
         v_int32 v_src1s = vx_load(src1 + step);
@@ -1039,6 +1039,7 @@ struct scalar_loader_n<sizeof(int), OP, int, T2, v_int32>
 
     static inline void l(const int* src1, const T2* scalar, int* dst)
     {
+        const int step = VTraits<v_int32>::vlanes();
         v_int32 v_src1 = vx_load(src1);
         v_int32 v_src1s = vx_load(src1 + step);
 
@@ -1064,10 +1065,9 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T2
 struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 {
     typedef OP<float, T2, v_float32> op;
-    enum {step = v_float32::nlanes};
-
     static inline void l(const float* src1, const float* src2, const T2* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src2 = vx_load(src2);
         v_float32 v_src1s = vx_load(src1 + step);
@@ -1082,6 +1082,7 @@ struct scalar_loader_n<sizeof(float), OP, float, T2, v_float32>
 
     static inline void l(const float* src1, const T2* scalar, float* dst)
     {
+        const int step = VTraits<v_float32>::vlanes();
         v_float32 v_src1 = vx_load(src1);
         v_float32 v_src1s = vx_load(src1 + step);
 
@@ -1258,10 +1259,10 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
                  T1* dst, size_t step, int width, int height, const T2* scalar)
 {
     typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
-    const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
-                          sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
+                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -1272,7 +1273,7 @@ static void scalar_loop(const T1* src1, size_t step1, const T1* src2, size_t ste
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if CV_SIMD || CV_SIMD_SCALABLE
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, src2 + x, scalar, dst + x);
@@ -1304,10 +1305,10 @@ template<template<typename T1, typename T2, typename Tvec> class OP, typename T1
 static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int width, int height, const T2* scalar)
 {
     typedef OP<T1, T2, Tvec> op;
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     typedef scalar_loader_n<sizeof(T1), OP, T1, T2, Tvec> ldr;
-    const int wide_step = sizeof(T1) > sizeof(ushort) ? Tvec::nlanes * 2 :
-                          sizeof(T1) == sizeof(uchar) ? Tvec::nlanes / 2 : Tvec::nlanes;
+    const int wide_step = sizeof(T1) > sizeof(ushort) ? VTraits<Tvec>::vlanes() * 2 :
+                          sizeof(T1) == sizeof(uchar) ? VTraits<Tvec>::vlanes() / 2 : VTraits<Tvec>::vlanes();
 #endif // CV_SIMD
 
     step1 /= sizeof(T1);
@@ -1317,7 +1318,7 @@ static void scalar_loop(const T1* src1, size_t step1, T1* dst, size_t step, int
     {
         int x = 0;
 
-    #if CV_SIMD
+    #if CV_SIMD || CV_SIMD_SCALABLE
         for (; x <= width - wide_step; x += wide_step)
         {
             ldr::l(src1 + x, scalar, dst + x);
@@ -1424,7 +1425,7 @@ template<typename T1, typename Tvec>
 struct op_mul
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a * b; }
+    { return v_mul(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return saturate_cast<T1>(a * b); }
 };
@@ -1432,11 +1433,11 @@ struct op_mul
 template<typename T1, typename T2, typename Tvec>
 struct op_mul_scale
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar * a * b;
+        return v_mul(v_mul(v_scalar, a), b); // (scalar * a) * b: same grouping as the removed operator form and the v_float64 specialization
     }
 #endif
     static inline T1 r(T1 a, T1 b, const T2* scalar)
@@ -1452,7 +1453,7 @@ struct op_mul_scale<double, double, v_float64>
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return v_scalar * a * b;
+        return v_mul(v_mul(v_scalar, a), b);
     }
 #endif
     static inline double r(double a, double b, const double* scalar)
@@ -1565,7 +1566,7 @@ template<typename T1, typename Tvec>
 struct op_div_f
 {
     static inline Tvec r(const Tvec& a, const Tvec& b)
-    { return a / b; }
+    { return v_div(a, b); }
     static inline T1 r(T1 a, T1 b)
     { return a / b; }
 };
@@ -1573,16 +1574,16 @@ struct op_div_f
 template<typename T1, typename T2, typename Tvec>
 struct op_div_scale
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
-        return v_select(denom == v_zero, v_zero, res);
+        const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
+        return v_select(v_eq(denom, v_zero), v_zero, res);
     }
 #endif
     static inline T1 r(T1 a, T1 denom, const T2* scalar)
@@ -1595,11 +1596,11 @@ struct op_div_scale
 template<>
 struct op_div_scale<float, float, v_float32>
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const v_float32& b, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
 #endif
     static inline float r(float a, float denom, const float* scalar)
@@ -1613,7 +1614,7 @@ struct op_div_scale<double, double, v_float64>
     static inline v_float64 r(const v_float64& a, const v_float64& b, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return a * v_scalar / b;
+        return v_div(v_mul(a, v_scalar), b);
     }
 #endif
     static inline double r(double a, double denom, const double* scalar)
@@ -1681,7 +1682,7 @@ DEFINE_SIMD_ALL(div, div_loop)
 template<typename T1, typename T2, typename Tvec>
 struct op_add_scale
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalar)
     {
         const v_float32 v_alpha = vx_setall_f32(*scalar);
@@ -1714,7 +1715,7 @@ struct op_add_scale<double, double, v_float64>
 template<typename T1, typename T2, typename Tvec>
 struct op_add_weighted
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const v_float32& b, const T2* scalars)
     {
         const v_float32 v_alpha = vx_setall_f32(scalars[0]);
@@ -1831,16 +1832,16 @@ DEFINE_SIMD_F64(addWeighted, add_weighted_loop_d)
 template<typename T1, typename T2, typename Tvec>
 struct op_recip
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const T2* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
     static inline Tvec pre(const Tvec& denom, const Tvec& res)
     {
-        const Tvec v_zero = vx_setall<typename Tvec::lane_type>(0);
-        return v_select(denom == v_zero, v_zero, res);
+        const Tvec v_zero = vx_setall<typename VTraits<Tvec>::lane_type>(0);
+        return v_select(v_eq(denom, v_zero), v_zero, res);
     }
 #endif
     static inline T1 r(T1 denom, const T2* scalar)
@@ -1853,11 +1854,11 @@ struct op_recip
 template<>
 struct op_recip<float, float, v_float32>
 {
-#if CV_SIMD
+#if CV_SIMD || CV_SIMD_SCALABLE
     static inline v_float32 r(const v_float32& a, const float* scalar)
     {
         const v_float32 v_scalar = vx_setall_f32(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
 #endif
     static inline float r(float denom, const float* scalar)
@@ -1871,7 +1872,7 @@ struct op_recip<double, double, v_float64>
     static inline v_float64 r(const v_float64& a, const double* scalar)
     {
         const v_float64 v_scalar = vx_setall_f64(*scalar);
-        return v_scalar / a;
+        return v_div(v_scalar, a);
     }
 #endif
     static inline double r(double denom, const double* scalar)
diff --git a/modules/core/src/check.cpp b/modules/core/src/check.cpp
index ffd9b302bf..2891f3a2e3 100644
--- a/modules/core/src/check.cpp
+++ b/modules/core/src/check.cpp
@@ -4,6 +4,8 @@
 
 #include "precomp.hpp"
 
+#include <sstream>
+
 #include "opencv2/core/check.hpp"
 
 namespace cv {
diff --git a/modules/core/src/convert.hpp b/modules/core/src/convert.hpp
index 3aa7dadac9..9f8e5643d5 100644
--- a/modules/core/src/convert.hpp
+++ b/modules/core/src/convert.hpp
@@ -11,7 +11,7 @@
 namespace cv
 {
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 static inline void vx_load_as(const uchar* ptr, v_float32& a)
 { a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
@@ -78,7 +78,7 @@ static inline void v_store_as(int64_t* ptr, const v_float32& a)
     v_int64 ia_0, ia_1;
     v_expand(ia, ia_0, ia_1);
     v_store(ptr, ia_0);
-    v_store(ptr + v_int64::nlanes, ia_1);
+    v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
 }
 
 static inline void v_store_as(uint64_t* ptr, const v_float32& a)
@@ -88,7 +88,7 @@ static inline void v_store_as(uint64_t* ptr, const v_float32& a)
     ia = v_max(ia, vx_setzero_s32());
     v_expand(v_reinterpret_as_u32(ia), ia_0, ia_1);
     v_store(ptr, ia_0);
-    v_store(ptr + v_int64::nlanes, ia_1);
+    v_store(ptr + VTraits<v_uint64>::vlanes(), ia_1);
 }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_uint16& a, v_uint16& b)
@@ -104,7 +104,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_uint16& a, v_uint16& b)
 }
 
 static inline void vx_load_pair_as(const ushort* ptr, v_uint16& a, v_uint16& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_uint16>::vlanes()); }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_int16& a, v_int16& b)
 {
@@ -118,7 +118,7 @@ static inline void vx_load_pair_as(const schar* ptr, v_int16& a, v_int16& b)
 { v_expand(vx_load(ptr), a, b); }
 
 static inline void vx_load_pair_as(const short* ptr, v_int16& a, v_int16& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_uint16::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_int16>::vlanes()); }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_int32& a, v_int32& b)
 {
@@ -147,7 +147,7 @@ static inline void vx_load_pair_as(const short* ptr, v_int32& a, v_int32& b)
 static inline void vx_load_pair_as(const int* ptr, v_int32& a, v_int32& b)
 {
     a = vx_load(ptr);
-    b = vx_load(ptr + v_int32::nlanes);
+    b = vx_load(ptr + VTraits<v_int32>::vlanes());
 }
 
 static inline void vx_load_pair_as(const uchar* ptr, v_float32& a, v_float32& b)
@@ -184,14 +184,14 @@ static inline void vx_load_pair_as(const short* ptr, v_float32& a, v_float32& b)
 
 static inline void vx_load_pair_as(const int* ptr, v_float32& a, v_float32& b)
 {
-    v_int32 ia = vx_load(ptr), ib = vx_load(ptr + v_int32::nlanes);
+    v_int32 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_int32>::vlanes());
     a = v_cvt_f32(ia);
     b = v_cvt_f32(ib);
 }
 
 static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
 {
-    const int int64_nlanes = v_int64::nlanes;
+    const int int64_nlanes = VTraits<v_int64>::vlanes();
     a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
     b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
 }
@@ -199,7 +199,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_int32& a, v_int32& b)
 static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
 {
     v_int64 z = vx_setzero_s64();
-    v_int64 ia = vx_load(ptr), ib = vx_load(ptr + v_int64::nlanes);
+    v_int64 ia = vx_load(ptr), ib = vx_load(ptr + VTraits<v_uint64>::vlanes());
     ia &= (ia > z);
     ib &= (ib > z);
     a = v_reinterpret_as_u64(ia);
@@ -208,7 +208,7 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint64& a, v_uint64& b)
 
 static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
 {
-    const int nlanes = v_int64::nlanes;
+    const int nlanes = VTraits<v_uint64>::vlanes();
     v_int64 z = vx_setzero_s64();
     v_int64 ia0 = vx_load(ptr), ia1 = vx_load(ptr + nlanes);
     v_int64 ib0 = vx_load(ptr + nlanes*2), ib1 = vx_load(ptr + nlanes*3);
@@ -222,8 +222,8 @@ static inline void vx_load_pair_as(const int64_t* ptr, v_uint32& a, v_uint32& b)
 
 static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32& b)
 {
-    const int nlanes = v_uint64::nlanes;
-    float buf[v_uint64::nlanes*4];
+    const int nlanes = VTraits<v_uint64>::vlanes();
+    float buf[VTraits<v_uint64>::max_nlanes*4];
     for (int i = 0; i < nlanes*4; i++) {
         buf[i] = (float)ptr[i];
     }
@@ -233,8 +233,8 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_float32& a, v_float32&
 
 static inline void vx_load_pair_as(const int64_t* ptr, v_float32& a, v_float32& b)
 {
-    const int nlanes = v_int64::nlanes;
-    float buf[v_int64::nlanes*4];
+    const int nlanes = VTraits<v_uint64>::vlanes();
+    float buf[VTraits<v_uint64>::max_nlanes*4];
     for (int i = 0; i < nlanes*4; i++) {
         buf[i] = (float)ptr[i];
     }
@@ -277,21 +277,21 @@ static inline void vx_load_pair_as(const int* ptr, v_uint32& a, v_uint32& b)
 {
     v_int32 z = vx_setzero_s32();
     v_int32 ia = v_max(vx_load(ptr), z);
-    v_int32 ib = v_max(vx_load(ptr + v_int32::nlanes), z);
+    v_int32 ib = v_max(vx_load(ptr + VTraits<v_int32>::vlanes()), z);
     a = v_reinterpret_as_u32(ia);
     b = v_reinterpret_as_u32(ib);
 }
 
 static inline void vx_load_pair_as(const uint64_t* ptr, v_uint32& a, v_uint32& b)
 {
-    const int int64_nlanes = v_int64::nlanes;
+    const int int64_nlanes = VTraits<v_uint64>::vlanes();
     a = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
     b = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
 }
 
 static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
 {
-    const int int64_nlanes = v_int64::nlanes;
+    const int int64_nlanes = VTraits<v_uint64>::vlanes();
     v_uint32 ua = v_pack(vx_load(ptr), vx_load(ptr + int64_nlanes));
     v_uint32 ub = v_pack(vx_load(ptr + int64_nlanes*2), vx_load(ptr + int64_nlanes*3));
     a = v_reinterpret_as_s32(ua);
@@ -299,37 +299,37 @@ static inline void vx_load_pair_as(const uint64_t* ptr, v_int32& a, v_int32& b)
 }
 
 static inline void vx_load_pair_as(const float* ptr, v_float32& a, v_float32& b)
-{ a = vx_load(ptr); b = vx_load(ptr + v_float32::nlanes); }
+{ a = vx_load(ptr); b = vx_load(ptr + VTraits<v_float32>::vlanes()); }
 
 static inline void vx_load_pair_as(const float16_t* ptr, v_float32& a, v_float32& b)
 {
     a = vx_load_expand(ptr);
-    b = vx_load_expand(ptr + v_float32::nlanes);
+    b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
 }
 
 static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float32& a, v_float32& b)
 {
     a = vx_load_expand(ptr);
-    b = vx_load_expand(ptr + v_float32::nlanes);
+    b = vx_load_expand(ptr + VTraits<v_float32>::vlanes());
 }
 
 static inline void vx_load_pair_as(const unsigned* ptr, v_uint32& a, v_uint32& b)
 {
     a = vx_load(ptr);
-    b = vx_load(ptr + v_uint32::nlanes);
+    b = vx_load(ptr + VTraits<v_uint32>::vlanes());
 }
 
 static inline void vx_load_pair_as(const unsigned* ptr, v_int32& a, v_int32& b)
 {
     a = v_reinterpret_as_s32(vx_load(ptr));
-    b = v_reinterpret_as_s32(vx_load(ptr + v_uint32::nlanes));
+    b = v_reinterpret_as_s32(vx_load(ptr + VTraits<v_uint32>::vlanes()));
 }
 
 static inline void vx_load_pair_as(const unsigned* ptr, v_float32& a, v_float32& b)
 {
     v_uint32 delta = vx_setall_u32(0x80000000U);
     v_uint32 ua = vx_load(ptr);
-    v_uint32 ub = vx_load(ptr + v_uint32::nlanes);
+    v_uint32 ub = vx_load(ptr + VTraits<v_uint32>::vlanes());
     v_uint32 mask_a = (ua >= delta) & delta, mask_b = (ub >= delta) & delta;
     v_float32 fmask_a = v_cvt_f32(v_reinterpret_as_s32(mask_a)); // 0.f or (float)(-(1 << 31))
     v_float32 fmask_b = v_cvt_f32(v_reinterpret_as_s32(mask_b)); // 0.f or (float)(-(1 << 31))
@@ -353,7 +353,7 @@ static inline void v_store_pair_as(schar* ptr, const v_uint16& a, const v_uint16
 }
 
 static inline void v_store_pair_as(ushort* ptr, const v_uint16& a, const v_uint16& b)
-{ v_store(ptr, a); v_store(ptr + v_uint16::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_uint16>::vlanes(), b); }
 
 static inline void v_store_pair_as(uchar* ptr, const v_int16& a, const v_int16& b)
 { v_store(ptr, v_pack_u(a, b)); }
@@ -362,7 +362,7 @@ static inline void v_store_pair_as(schar* ptr, const v_int16& a, const v_int16&
 { v_store(ptr, v_pack(a, b)); }
 
 static inline void v_store_pair_as(short* ptr, const v_int16& a, const v_int16& b)
-{ v_store(ptr, a); v_store(ptr + v_int16::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_int16>::vlanes(), b); }
 
 static inline void v_store_pair_as(uchar* ptr, const v_int32& a, const v_int32& b)
 { v_pack_u_store(ptr, v_pack(a, b)); }
@@ -379,7 +379,7 @@ static inline void v_store_pair_as(short* ptr, const v_int32& a, const v_int32&
 static inline void v_store_pair_as(int* ptr, const v_int32& a, const v_int32& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_int32::nlanes, b);
+    v_store(ptr + VTraits<v_int32>::vlanes(), b);
 }
 
 static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32& b)
@@ -387,7 +387,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_int32& a, const v_int32
     v_int64 q0, q1, q2, q3;
     v_expand(a, q0, q1);
     v_expand(b, q2, q3);
-    const int nlanes = v_int64::nlanes;
+    const int nlanes = VTraits<v_uint64>::vlanes();
     v_store(ptr, q0);
     v_store(ptr + nlanes, q1);
     v_store(ptr + nlanes*2, q2);
@@ -419,11 +419,11 @@ static inline void v_store_pair_as(int* ptr, const v_float32& a, const v_float32
 {
     v_int32 ia = v_round(a), ib = v_round(b);
     v_store(ptr, ia);
-    v_store(ptr + v_int32::nlanes, ib);
+    v_store(ptr + VTraits<v_int32>::vlanes(), ib);
 }
 
 static inline void v_store_pair_as(float* ptr, const v_float32& a, const v_float32& b)
-{ v_store(ptr, a); v_store(ptr + v_float32::nlanes, b); }
+{ v_store(ptr, a); v_store(ptr + VTraits<v_float32>::vlanes(), b); }
 
 static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_float32& b)
 {
@@ -431,7 +431,7 @@ static inline void v_store_pair_as(unsigned* ptr, const v_float32& a, const v_fl
     v_int32 ia = v_max(v_round(a), z);
     v_int32 ib = v_max(v_round(b), z);
     v_store(ptr, v_reinterpret_as_u32(ia));
-    v_store(ptr + v_int32::nlanes, v_reinterpret_as_u32(ib));
+    v_store(ptr + VTraits<v_int32>::vlanes(), v_reinterpret_as_u32(ib));
 }
 
 static inline void v_store_pair_as(uchar* ptr, const v_uint32& a, const v_uint32& b)
@@ -447,7 +447,7 @@ static inline void v_store_pair_as(ushort* ptr, const v_uint32& a, const v_uint3
 static inline void v_store_pair_as(unsigned* ptr, const v_uint32& a, const v_uint32& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_uint32::nlanes, b);
+    v_store(ptr + VTraits<v_uint32>::vlanes(), b);
 }
 
 static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uint32& b)
@@ -455,7 +455,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
     v_uint64 q0, q1, q2, q3;
     v_expand(a, q0, q1);
     v_expand(b, q2, q3);
-    const int nlanes = v_uint64::nlanes;
+    const int nlanes = VTraits<v_uint64>::vlanes();
     v_store(ptr, q0);
     v_store(ptr + nlanes, q1);
     v_store(ptr + nlanes*2, q2);
@@ -465,28 +465,28 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_uint32& a, const v_uin
 static inline void v_store_pair_as(uint64_t* ptr, const v_uint64& a, const v_uint64& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_uint64::nlanes, b);
+    v_store(ptr + VTraits<v_uint64>::vlanes(), b);
 }
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 
 static inline void vx_load_as(const uint64_t* ptr, v_float32& a)
 {
     v_float64 a_0 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
-    v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + v_uint64::nlanes)));
+    v_float64 a_1 = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + VTraits<v_uint64>::vlanes())));
     a = v_cvt_f32(a_0, a_1);
 }
 
 static inline void vx_load_as(const int64_t* ptr, v_float32& a)
 {
     v_float64 a_0 = v_cvt_f64(vx_load(ptr));
-    v_float64 a_1 = v_cvt_f64(vx_load(ptr + v_uint64::nlanes));
+    v_float64 a_1 = v_cvt_f64(vx_load(ptr + VTraits<v_int64>::vlanes()));
     a = v_cvt_f32(a_0, a_1);
 }
 
 static inline void vx_load_as(const double* ptr, v_float32& a)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
     a = v_cvt_f32(v0, v1);
 }
 
@@ -516,8 +516,8 @@ static inline void vx_load_pair_as(const bfloat16_t* ptr, v_float64& a, v_float6
 
 static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
-    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
+    v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
     v_int32 iv0 = v_round(v0), iv1 = v_round(v1);
     v_int32 iv2 = v_round(v2), iv3 = v_round(v3);
     a = v_combine_low(iv0, iv1);
@@ -526,15 +526,15 @@ static inline void vx_load_pair_as(const double* ptr, v_int32& a, v_int32& b)
 
 static inline void vx_load_pair_as(const uint64_t* ptr, v_float64& a, v_float64& b)
 {
-    const int int64_nlanes = v_int64::nlanes;
+    const int int64_nlanes = VTraits<v_uint64>::vlanes();
     a = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr)));
     b = v_cvt_f64(v_reinterpret_as_s64(vx_load(ptr + int64_nlanes)));
 }
 
 static inline void vx_load_pair_as(const double* ptr, v_float32& a, v_float32& b)
 {
-    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + v_float64::nlanes);
-    v_float64 v2 = vx_load(ptr + v_float64::nlanes*2), v3 = vx_load(ptr + v_float64::nlanes*3);
+    v_float64 v0 = vx_load(ptr), v1 = vx_load(ptr + VTraits<v_float64>::vlanes());
+    v_float64 v2 = vx_load(ptr + VTraits<v_float64>::vlanes()*2), v3 = vx_load(ptr + VTraits<v_float64>::vlanes()*3);
     a = v_cvt_f32(v0, v1);
     b = v_cvt_f32(v2, v3);
 }
@@ -584,19 +584,19 @@ static inline void vx_load_pair_as(const float* ptr, v_float64& a, v_float64& b)
 static inline void vx_load_pair_as(const double* ptr, v_float64& a, v_float64& b)
 {
     a = vx_load(ptr);
-    b = vx_load(ptr + v_float64::nlanes);
+    b = vx_load(ptr + VTraits<v_float64>::vlanes());
 }
 
 static inline void vx_load_pair_as(const int64_t* ptr, v_float64& a, v_float64& b)
 {
     a = v_cvt_f64(vx_load(ptr));
-    b = v_cvt_f64(vx_load(ptr + v_float64::nlanes));
+    b = v_cvt_f64(vx_load(ptr + VTraits<v_float64>::vlanes()));
 }
 
 static inline void vx_load_pair_as(const unsigned* ptr, v_float64& a, v_float64& b)
 {
-    const int nlanes = v_uint64::nlanes;
-    double buf[v_uint64::nlanes*2];
+    const int nlanes = VTraits<v_uint64>::vlanes();
+    double buf[VTraits<v_uint64>::max_nlanes*2];
     for (int i = 0; i < nlanes*2; i++)
         buf[i] = (double)ptr[i];
     a = vx_load(buf);
@@ -607,7 +607,7 @@ static inline void v_store_as(double* ptr, const v_float32& a)
 {
     v_float64 fa0 = v_cvt_f64(a), fa1 = v_cvt_f64_high(a);
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32& b)
@@ -616,9 +616,9 @@ static inline void v_store_pair_as(double* ptr, const v_int32& a, const v_int32&
     v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
 
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
-    v_store(ptr + v_float64::nlanes*2, fb0);
-    v_store(ptr + v_float64::nlanes*3, fb1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
+    v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_float32& b)
@@ -627,15 +627,15 @@ static inline void v_store_pair_as(double* ptr, const v_float32& a, const v_floa
     v_float64 fb0 = v_cvt_f64(b), fb1 = v_cvt_f64_high(b);
 
     v_store(ptr, fa0);
-    v_store(ptr + v_float64::nlanes, fa1);
-    v_store(ptr + v_float64::nlanes*2, fb0);
-    v_store(ptr + v_float64::nlanes*3, fb1);
+    v_store(ptr + VTraits<v_float64>::vlanes(), fa1);
+    v_store(ptr + VTraits<v_float64>::vlanes()*2, fb0);
+    v_store(ptr + VTraits<v_float64>::vlanes()*3, fb1);
 }
 
 static inline void v_store_pair_as(double* ptr, const v_float64& a, const v_float64& b)
 {
     v_store(ptr, a);
-    v_store(ptr + v_float64::nlanes, b);
+    v_store(ptr + VTraits<v_float64>::vlanes(), b);
 }
 
 static inline void v_store_pair_as(int* ptr, const v_float64& a, const v_float64& b)
@@ -662,7 +662,7 @@ static inline void v_store_pair_as(uint64_t* ptr, const v_float64& a, const v_fl
     v_int64 ia, ib;
     v_expand(v_round(v_max(a, z), v_max(b, z)), ia, ib);
     v_store(ptr, v_reinterpret_as_u64(ia));
-    v_store(ptr + v_int64::nlanes, v_reinterpret_as_u64(ib));
+    v_store(ptr + VTraits<v_uint64>::vlanes(), v_reinterpret_as_u64(ib));
 }
 
 static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_float64& b)
@@ -670,7 +670,7 @@ static inline void v_store_pair_as(int64_t* ptr, const v_float64& a, const v_flo
     v_int64 ia, ib;
     v_expand(v_round(a, b), ia, ib);
     v_store(ptr, ia);
-    v_store(ptr + v_int64::nlanes, ib);
+    v_store(ptr + VTraits<v_uint64>::vlanes(), ib);
 }
 
 static inline void v_store_pair_as(unsigned* ptr, const v_float64& a, const v_float64& b)
@@ -744,9 +744,9 @@ static inline void v_store_pair_as(double* ptr, const _Tsvec& a, const _Tsvec& b
         ptr[i] = (double)buf[i];
 }
 
-#endif /////////// CV_SIMD_64F
+#endif /////////// CV_SIMD_64F || CV_SIMD_SCALABLE_64F
 
-#endif /////////// CV_SIMD
+#endif /////////// CV_SIMD || CV_SIMD_SCALABLE
 
 }
 
diff --git a/modules/core/src/convert.simd.hpp b/modules/core/src/convert.simd.hpp
index c776918846..84161b2da7 100644
--- a/modules/core/src/convert.simd.hpp
+++ b/modules/core/src/convert.simd.hpp
@@ -41,8 +41,8 @@ void cvt16f32f( const float16_t* src, float* dst, int len )
 {
     CV_INSTRUMENT_REGION();
     int j = 0;
-#if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; j < len; j += VECSZ )
     {
         if( j > len - VECSZ )
@@ -62,8 +62,8 @@ void cvt32f16f( const float* src, float16_t* dst, int len )
 {
     CV_INSTRUMENT_REGION();
     int j = 0;
-#if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; j < len; j += VECSZ )
     {
         if( j > len - VECSZ )
@@ -83,8 +83,8 @@ void cvt32f16bf( const float* src, bfloat16_t* dst, int len )
 {
     CV_INSTRUMENT_REGION();
     int j = 0;
-#if CV_SIMD
-    const int VECSZ = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; j < len; j += VECSZ )
     {
         if( j > len - VECSZ )
@@ -153,8 +153,8 @@ cvt_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
-        const int VECSZ = _Twvec::nlanes*2;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<_Twvec>::vlanes()*2;
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -182,8 +182,8 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD_64F
-        const int VECSZ = v_float64::nlanes*2;
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
+        const int VECSZ = VTraits<v_float64>::vlanes()*2;
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -213,8 +213,8 @@ cvt1_( const _Ts* src, size_t sstep, _Td* dst, size_t dstep, Size size )
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
-        const int VECSZ = _Twvec::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        const int VECSZ = VTraits<_Twvec>::vlanes();
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
diff --git a/modules/core/src/convert_scale.simd.hpp b/modules/core/src/convert_scale.simd.hpp
index f1ee7635e7..e29fe06a9b 100644
--- a/modules/core/src/convert_scale.simd.hpp
+++ b/modules/core/src/convert_scale.simd.hpp
@@ -22,9 +22,9 @@ template<typename _Ts, typename _Td> inline void
 cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
             Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
+    const int VECSZ = VTraits<v_float32>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -32,7 +32,7 @@ cvtabs_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -72,9 +72,9 @@ template<typename _Ts, typename _Td> inline void
 cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
          Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes*2;
+    const int VECSZ = VTraits<v_float32>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -82,7 +82,7 @@ cvt_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -108,9 +108,9 @@ template<typename _Ts, typename _Td> inline void
 cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
           Size size, float a, float b )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 va = vx_setall_f32(a), vb = vx_setall_f32(b);
-    const int VECSZ = v_float32::nlanes;
+    const int VECSZ = VTraits<v_float32>::vlanes();
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -118,7 +118,7 @@ cvt1_32f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
@@ -143,9 +143,9 @@ template<typename _Ts, typename _Td> inline void
 cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
          Size size, double a, double b )
 {
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
-    const int VECSZ = v_float64::nlanes*2;
+    const int VECSZ = VTraits<v_float64>::vlanes()*2;
 #endif
     sstep /= sizeof(src[0]);
     dstep /= sizeof(dst[0]);
@@ -153,7 +153,7 @@ cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
     for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
     {
         int j = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
         for( ; j < size.width; j += VECSZ )
         {
             if( j > size.width - VECSZ )
diff --git a/modules/core/src/copy.cpp b/modules/core/src/copy.cpp
index 09250b8585..1b14c53ab0 100644
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -171,15 +171,15 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
         const uchar* src = (const uchar*)_src;
         uchar* dst = (uchar*)_dst;
         int x = 0;
-        #if CV_SIMD
+        #if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
+            for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
             {
                 v_uint8 v_src   = vx_load(src  + x),
                         v_dst   = vx_load(dst  + x),
-                        v_nmask = vx_load(mask + x) == v_zero;
+                        v_nmask = v_eq(vx_load(mask + x), v_zero);
 
                 v_dst = v_select(v_nmask, v_dst, v_src);
                 v_store(dst + x, v_dst);
@@ -203,23 +203,23 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
         const ushort* src = (const ushort*)_src;
         ushort* dst = (ushort*)_dst;
         int x = 0;
-        #if CV_SIMD
+        #if (CV_SIMD || CV_SIMD_SCALABLE)
         {
             v_uint8 v_zero = vx_setzero_u8();
 
-            for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
+            for( ; x <= size.width - VTraits<v_uint8>::vlanes(); x += VTraits<v_uint8>::vlanes() )
             {
-                v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
-                         v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);
+                v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + VTraits<v_uint16>::vlanes()),
+                         v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + VTraits<v_uint16>::vlanes());
 
                 v_uint8 v_nmask1, v_nmask2;
-                v_uint8 v_nmask = vx_load(mask + x) == v_zero;
+                v_uint8 v_nmask = v_eq(vx_load(mask + x), v_zero);
                 v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
 
                 v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
                 v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
                 v_store(dst + x, v_dst1);
-                v_store(dst + x + v_uint16::nlanes, v_dst2);
+                v_store(dst + x + VTraits<v_uint16>::vlanes(), v_dst2);
             }
         }
         vx_cleanup();
diff --git a/modules/core/src/count_non_zero.simd.hpp b/modules/core/src/count_non_zero.simd.hpp
index 6994564127..ce7c75aa54 100644
--- a/modules/core/src/count_non_zero.simd.hpp
+++ b/modules/core/src/count_non_zero.simd.hpp
@@ -32,8 +32,8 @@ static int countNonZero_(const T* src, int len )
 static int countNonZero8u( const uchar* src, int len )
 {
     int i=0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_uint8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint8>::vlanes();
     v_uint8 v_zero = vx_setzero_u8();
     v_uint8 v_one = vx_setall_u8(1);
 
@@ -42,20 +42,20 @@ static int countNonZero8u( const uchar* src, int len )
     {
         v_uint16 v_sum16 = vx_setzero_u16();
         int j = i;
-        while (j < std::min(len0, i + 65280 * v_uint16::nlanes))
+        while (j < std::min(len0, i + 65280 * VTraits<v_uint16>::vlanes()))
         {
             v_uint8 v_sum8 = vx_setzero_u8();
             int k = j;
-            for (; k < std::min(len0, j + 255 * v_uint8::nlanes); k += v_uint8::nlanes)
-                v_sum8 += v_one & (vx_load(src + k) == v_zero);
+            for (; k < std::min(len0, j + 255 * VTraits<v_uint8>::vlanes()); k += VTraits<v_uint8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_eq(vx_load(src + k), v_zero)));
             v_uint16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_uint32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -69,8 +69,8 @@ static int countNonZero8u( const uchar* src, int len )
 static int countNonZero16u( const ushort* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_uint16 v_zero = vx_setzero_u16();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -79,20 +79,20 @@ static int countNonZero16u( const ushort* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(v_reinterpret_as_s16(vx_load(src + k) == v_zero), v_reinterpret_as_s16(vx_load(src + k + v_uint16::nlanes) == v_zero));
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_reinterpret_as_s16(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s16(v_eq(vx_load(src + k + VTraits<v_uint16>::vlanes()), v_zero)))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -104,8 +104,8 @@ static int countNonZero16u( const ushort* src, int len )
 static int countNonZero32s( const int* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_int32 v_zero = vx_setzero_s32();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -114,23 +114,20 @@ static int countNonZero32s( const int* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(
-                    v_pack(vx_load(src + k                    ) == v_zero, vx_load(src + k +   v_int32::nlanes) == v_zero),
-                    v_pack(vx_load(src + k + 2*v_int32::nlanes) == v_zero, vx_load(src + k + 3*v_int32::nlanes) == v_zero)
-                );
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_eq(vx_load(src + k), v_zero), v_eq(vx_load(src + k + VTraits<v_int32>::vlanes()), v_zero)), v_pack(v_eq(vx_load(src + k + 2 * VTraits<v_int32>::vlanes()), v_zero), v_eq(vx_load(src + k + 3 * VTraits<v_int32>::vlanes()), v_zero)))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -142,8 +139,8 @@ static int countNonZero32s( const int* src, int len )
 static int countNonZero32f( const float* src, int len )
 {
     int i = 0, nz = 0;
-#if CV_SIMD
-    int len0 = len & -v_int8::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int8>::vlanes();
     v_float32 v_zero = vx_setzero_f32();
     v_int8 v_one = vx_setall_s8(1);
 
@@ -152,23 +149,20 @@ static int countNonZero32f( const float* src, int len )
     {
         v_int16 v_sum16 = vx_setzero_s16();
         int j = i;
-        while (j < std::min(len0, i + 32766 * v_int16::nlanes))
+        while (j < std::min(len0, i + 32766 * VTraits<v_int16>::vlanes()))
         {
             v_int8 v_sum8 = vx_setzero_s8();
             int k = j;
-            for (; k < std::min(len0, j + 127 * v_int8::nlanes); k += v_int8::nlanes)
-                v_sum8 += v_one & v_pack(
-                    v_pack(v_reinterpret_as_s32(vx_load(src + k                      ) == v_zero), v_reinterpret_as_s32(vx_load(src + k +   v_float32::nlanes) == v_zero)),
-                    v_pack(v_reinterpret_as_s32(vx_load(src + k + 2*v_float32::nlanes) == v_zero), v_reinterpret_as_s32(vx_load(src + k + 3*v_float32::nlanes) == v_zero))
-                );
+            for (; k < std::min(len0, j + 127 * VTraits<v_int8>::vlanes()); k += VTraits<v_int8>::vlanes())
+                v_sum8 = v_add(v_sum8, v_and(v_one, v_pack(v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + VTraits<v_float32>::vlanes()), v_zero))), v_pack(v_reinterpret_as_s32(v_eq(vx_load(src + k + 2 * VTraits<v_float32>::vlanes()), v_zero)), v_reinterpret_as_s32(v_eq(vx_load(src + k + 3 * VTraits<v_float32>::vlanes()), v_zero))))));
             v_int16 part1, part2;
             v_expand(v_sum8, part1, part2);
-            v_sum16 += part1 + part2;
+            v_sum16 = v_add(v_sum16, v_add(part1, part2));
             j = k;
         }
         v_int32 part1, part2;
         v_expand(v_sum16, part1, part2);
-        v_sum32 += part1 + part2;
+        v_sum32 = v_add(v_sum32, v_add(part1, part2));
         i = j;
     }
     nz = i - v_reduce_sum(v_sum32);
@@ -180,21 +174,21 @@ static int countNonZero32f( const float* src, int len )
 static int countNonZero64f( const double* src, int len )
 {
     int nz = 0, i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_int64 sum1 = vx_setzero_s64();
     v_int64 sum2 = vx_setzero_s64();
     v_float64 zero = vx_setzero_f64();
-    int step = v_float64::nlanes * 2;
+    int step = VTraits<v_float64>::vlanes() * 2;
     int len0 = len & -step;
 
     for(i = 0; i < len0; i += step )
         {
-        sum1 += v_reinterpret_as_s64(vx_load(&src[i]) == zero);
-        sum2 += v_reinterpret_as_s64(vx_load(&src[i + step / 2]) == zero);
+        sum1 = v_add(sum1, v_reinterpret_as_s64(v_eq(vx_load(&src[i]), zero)));
+        sum2 = v_add(sum2, v_reinterpret_as_s64(v_eq(vx_load(&src[i + step / 2]), zero)));
         }
 
     // N.B the value is incremented by -1 (0xF...F) for each value
-    nz = i + (int)v_reduce_sum(sum1 + sum2);
+    nz = i + (int)v_reduce_sum(v_add(sum1, sum2));
     v_cleanup();
 #endif
     return nz + countNonZero_(src + i, len - i);
diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp
index a644fe15a7..43c6d07d58 100644
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -274,22 +274,21 @@ template<typename T> struct VBLAS
 {
     int dot(const T*, const T*, int, T*) const { return 0; }
     int givens(T*, T*, int, T, T) const { return 0; }
-    int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };
 
-#if CV_SIMD
+#if CV_SIMD // TODO: enable for CV_SIMD_SCALABLE_64F
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
-    if( n < 2*v_float32::nlanes )
+    if( n < 2*VTraits<v_float32>::vlanes() )
         return 0;
     int k = 0;
     v_float32 s0 = vx_setzero_f32();
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
+    for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         v_float32 a0 = vx_load(a + k);
         v_float32 b0 = vx_load(b + k);
 
-        s0 += a0 * b0;
+        s0 = v_add(s0, v_mul(a0, b0));
     }
     *result = v_reduce_sum(s0);
     vx_cleanup();
@@ -299,16 +298,16 @@ template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, f
 
 template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
 {
-    if( n < v_float32::nlanes)
+    if( n < VTraits<v_float32>::vlanes())
         return 0;
     int k = 0;
     v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
+    for( ; k <= n - VTraits<v_float32>::vlanes(); k += VTraits<v_float32>::vlanes() )
     {
         v_float32 a0 = vx_load(a + k);
         v_float32 b0 = vx_load(b + k);
-        v_float32 t0 = (a0 * c4) + (b0 * s4);
-        v_float32 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 t0 = v_add(v_mul(a0, c4), v_mul(b0, s4));
+        v_float32 t1 = v_sub(v_mul(b0, c4), v_mul(a0, s4));
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
@@ -317,44 +316,19 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
 }
 
 
-template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s,
-                                             float* anorm, float* bnorm) const
-{
-    if( n < v_float32::nlanes)
-        return 0;
-    int k = 0;
-    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
-    v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
-    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
-    {
-        v_float32 a0 = vx_load(a + k);
-        v_float32 b0 = vx_load(b + k);
-        v_float32 t0 = (a0 * c4) + (b0 * s4);
-        v_float32 t1 = (b0 * c4) - (a0 * s4);
-        v_store(a + k, t0);
-        v_store(b + k, t1);
-        sa += t0 + t0;
-        sb += t1 + t1;
-    }
-    *anorm = v_reduce_sum(sa);
-    *bnorm = v_reduce_sum(sb);
-    vx_cleanup();
-    return k;
-}
-
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
-    if( n < 2*v_float64::nlanes )
+    if( n < 2*VTraits<v_float64>::vlanes() )
         return 0;
     int k = 0;
     v_float64 s0 = vx_setzero_f64();
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
+    for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
     {
         v_float64 a0 = vx_load(a + k);
         v_float64 b0 = vx_load(b + k);
 
-        s0 += a0 * b0;
+        s0 = v_add(s0, v_mul(a0, b0));
     }
     double sbuf[2];
     v_store(sbuf, s0);
@@ -368,12 +342,12 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
 {
     int k = 0;
     v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
+    for( ; k <= n - VTraits<v_float64>::vlanes(); k += VTraits<v_float64>::vlanes() )
     {
         v_float64 a0 = vx_load(a + k);
         v_float64 b0 = vx_load(b + k);
-        v_float64 t0 = (a0 * c2) + (b0 * s2);
-        v_float64 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 t0 = v_add(v_mul(a0, c2), v_mul(b0, s2));
+        v_float64 t1 = v_sub(v_mul(b0, c2), v_mul(a0, s2));
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
@@ -382,30 +356,6 @@ template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double
 }
 
 
-template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double c, double s,
-                                              double* anorm, double* bnorm) const
-{
-    int k = 0;
-    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
-    v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
-    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
-    {
-        v_float64 a0 = vx_load(a + k);
-        v_float64 b0 = vx_load(b + k);
-        v_float64 t0 = (a0 * c2) + (b0 * s2);
-        v_float64 t1 = (b0 * c2) - (a0 * s2);
-        v_store(a + k, t0);
-        v_store(b + k, t1);
-        sa += t0 * t0;
-        sb += t1 * t1;
-    }
-    double abuf[2], bbuf[2];
-    v_store(abuf, sa);
-    v_store(bbuf, sb);
-    *anorm = abuf[0] + abuf[1];
-    *bnorm = bbuf[0] + bbuf[1];
-    return k;
-}
 #endif //CV_SIMD_64F
 #endif //CV_SIMD
 
@@ -916,7 +866,7 @@ double invert( InputArray _src, OutputArray _dst, int method )
                 #if CV_SIMD128
                     const float d_32f = (float)d;
                     const v_float32x4 d_vec(d_32f, -d_32f, -d_32f, d_32f);
-                    v_float32x4 s0 = v_load_halves((const float*)srcdata, (const float*)(srcdata + srcstep)) * d_vec;//0123//3120
+                    v_float32x4 s0 = v_mul(v_load_halves((const float *)srcdata, (const float *)(srcdata + srcstep)), d_vec);//0123//3120
                     s0 = v_extract<3>(s0, v_combine_low(v_rotate_right<1>(s0), s0));
                     v_store_low((float*)dstdata, s0);
                     v_store_high((float*)(dstdata + dststep), s0);
@@ -942,10 +892,10 @@ double invert( InputArray _src, OutputArray _dst, int method )
                     d = 1./d;
                 #if CV_SIMD128_64F
                     v_float64x2 det = v_setall_f64(d);
-                    v_float64x2 s0 = v_load((const double*)srcdata) * det;
-                    v_float64x2 s1 = v_load((const double*)(srcdata+srcstep)) * det;
+                    v_float64x2 s0 = v_mul(v_load((const double *)srcdata), det);
+                    v_float64x2 s1 = v_mul(v_load((const double *)(srcdata + srcstep)), det);
                     v_float64x2 sm = v_extract<1>(s1, s0);//30
-                    v_float64x2 ss = v_setall<double>(0) - v_extract<1>(s0, s1);//12
+                    v_float64x2 ss = v_sub(v_setall<double>(0), v_extract<1>(s0, s1));//12
                     v_store((double*)dstdata, v_combine_low(sm, ss));//31
                     v_store((double*)(dstdata + dststep), v_combine_high(ss, sm));//20
                 #else
diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp
index 05c6d3bd1f..0bec1be6c0 100644
--- a/modules/core/src/mathfuncs.cpp
+++ b/modules/core/src/mathfuncs.cpp
@@ -614,13 +614,13 @@ void polarToCart( InputArray src1, InputArray src2,
                 {
                     k = 0;
 
-#if CV_SIMD
-                    int cWidth = v_float32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+                    int cWidth = VTraits<v_float32>::vlanes();
                     for( ; k <= len - cWidth; k += cWidth )
                     {
                         v_float32 v_m = vx_load(mag + k);
-                        v_store(x + k, vx_load(x + k) * v_m);
-                        v_store(y + k, vx_load(y + k) * v_m);
+                        v_store(x + k, v_mul(vx_load(x + k), v_m));
+                        v_store(y + k, v_mul(vx_load(y + k), v_m));
                     }
                     vx_cleanup();
 #endif
@@ -741,7 +741,7 @@ struct iPow_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct iPow_SIMD<uchar, int>
@@ -751,7 +751,7 @@ struct iPow_SIMD<uchar, int>
         int i = 0;
         v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
+        for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_a1 = v_1, v_a2 = v_1;
             v_uint16 v = vx_load_expand(src + i);
@@ -763,16 +763,16 @@ struct iPow_SIMD<uchar, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
@@ -791,7 +791,7 @@ struct iPow_SIMD<schar, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
+        for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
             v_int16 v = vx_load_expand(src + i);
@@ -803,16 +803,16 @@ struct iPow_SIMD<schar, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
@@ -831,7 +831,7 @@ struct iPow_SIMD<ushort, int>
         int i = 0;
         v_uint32 v_1 = vx_setall_u32(1u);
 
-        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
+        for ( ; i <= len - VTraits<v_uint16>::vlanes(); i += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_a1 = v_1, v_a2 = v_1;
             v_uint16 v = vx_load(src + i);
@@ -843,16 +843,16 @@ struct iPow_SIMD<ushort, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
@@ -871,7 +871,7 @@ struct iPow_SIMD<short, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
+        for ( ; i <= len - VTraits<v_int16>::vlanes(); i += VTraits<v_int16>::vlanes())
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
             v_int16 v = vx_load(src + i);
@@ -883,16 +883,16 @@ struct iPow_SIMD<short, int>
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
@@ -911,29 +911,29 @@ struct iPow_SIMD<int, int>
         int i = 0;
         v_int32 v_1 = vx_setall_s32(1);
 
-        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
+        for ( ; i <= len - VTraits<v_int32>::vlanes()*2; i += VTraits<v_int32>::vlanes()*2)
         {
             v_int32 v_a1 = v_1, v_a2 = v_1;
-            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_int32>::vlanes());
             int p = power;
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_int32::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_int32>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -949,34 +949,34 @@ struct iPow_SIMD<float, float>
         int i = 0;
         v_float32 v_1 = vx_setall_f32(1.f);
 
-        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
+        for ( ; i <= len - VTraits<v_float32>::vlanes()*2; i += VTraits<v_float32>::vlanes()*2)
         {
             v_float32 v_a1 = v_1, v_a2 = v_1;
-            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float32>::vlanes());
             int p = std::abs(power);
             if( power < 0 )
             {
-                v_b1 = v_1 / v_b1;
-                v_b2 = v_1 / v_b2;
+                v_b1 = v_div(v_1, v_b1);
+                v_b2 = v_div(v_1, v_b2);
             }
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_float32::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_float32>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -984,7 +984,7 @@ struct iPow_SIMD<float, float>
     }
 };
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template <>
 struct iPow_SIMD<double, double>
 {
@@ -993,34 +993,34 @@ struct iPow_SIMD<double, double>
         int i = 0;
         v_float64 v_1 = vx_setall_f64(1.);
 
-        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
+        for ( ; i <= len - VTraits<v_float64>::vlanes()*2; i += VTraits<v_float64>::vlanes()*2)
         {
             v_float64 v_a1 = v_1, v_a2 = v_1;
-            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + VTraits<v_float64>::vlanes());
             int p = std::abs(power);
             if( power < 0 )
             {
-                v_b1 = v_1 / v_b1;
-                v_b2 = v_1 / v_b2;
+                v_b1 = v_div(v_1, v_b1);
+                v_b2 = v_div(v_1, v_b2);
             }
 
             while( p > 1 )
             {
                 if (p & 1)
                 {
-                    v_a1 *= v_b1;
-                    v_a2 *= v_b2;
+                    v_a1 = v_mul(v_a1, v_b1);
+                    v_a2 = v_mul(v_a2, v_b2);
                 }
-                v_b1 *= v_b1;
-                v_b2 *= v_b2;
+                v_b1 = v_mul(v_b1, v_b1);
+                v_b2 = v_mul(v_b2, v_b2);
                 p >>= 1;
             }
 
-            v_a1 *= v_b1;
-            v_a2 *= v_b2;
+            v_a1 = v_mul(v_a1, v_b1);
+            v_a2 = v_mul(v_a2, v_b2);
 
             v_store(dst + i, v_a1);
-            v_store(dst + i + v_float64::nlanes, v_a2);
+            v_store(dst + i + VTraits<v_float64>::vlanes(), v_a2);
         }
         vx_cleanup();
 
@@ -1614,7 +1614,7 @@ void patchNaNs( InputOutputArray _a, double _val )
     Cv32suf val;
     val.f = (float)_val;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
     v_int32 v_val = vx_setall_s32(val.i);
 #endif
@@ -1624,12 +1624,12 @@ void patchNaNs( InputOutputArray _a, double _val )
         int* tptr = ptrs[0];
         size_t j = 0;
 
-#if CV_SIMD
-        size_t cWidth = (size_t)v_int32::nlanes;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+        size_t cWidth = (size_t)VTraits<v_int32>::vlanes();
         for ( ; j + cWidth <= len; j += cWidth)
         {
             v_int32 v_src = vx_load(tptr + j);
-            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
+            v_int32 v_cmp_mask = v_lt(v_mask2, v_and(v_src, v_mask1));
             v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
             v_store(tptr + j, v_dst);
         }
diff --git a/modules/core/src/matmul.simd.hpp b/modules/core/src/matmul.simd.hpp
index e32096cf71..4ac1e21bb6 100644
--- a/modules/core/src/matmul.simd.hpp
+++ b/modules/core/src/matmul.simd.hpp
@@ -1454,7 +1454,7 @@ transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
 static void
 transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int BITS = 10, SCALE = 1 << BITS;
     const float MAX_M = (float)(1 << (15 - BITS));
 
@@ -1485,7 +1485,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
         v_int32 m10 = vx_setall_s32(m32[4]);
         v_int32 m11 = vx_setall_s32(m32[5]);
         int x = 0;
-        for (; x <= (len - v_uint8::nlanes) * nChannels; x += v_uint8::nlanes * nChannels)
+        for (; x <= (len - VTraits<v_uint8>::vlanes()) * nChannels; x += VTraits<v_uint8>::vlanes() * nChannels)
         {
             v_uint8 b, g, r;
             v_load_deinterleave(src + x, b, g, r);
@@ -1499,20 +1499,20 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
             v_int32 p1, p3;
             v_expand(bgl, p0, p2);
             v_expand(v_reinterpret_as_s16(rl), p1, p3);
-            dbl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 *  m2 + m3,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 *  m2 + m3);
-            dgl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 *  m6 + m7,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 *  m6 + m7);
-            drl = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            dbl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
+            dgl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
+            drl = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
             v_expand(bgh, p0, p2);
             v_expand(v_reinterpret_as_s16(rh), p1, p3);
-            dbh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m01) + p1 *  m2 + m3,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m01) + p3 *  m2 + m3);
-            dgh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m45) + p1 *  m6 + m7,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m45) + p3 *  m6 + m7);
-            drh = v_rshr_pack<BITS>(v_dotprod(v_reinterpret_as_s16(p0), m89) + p1 * m10 + m11,
-                                    v_dotprod(v_reinterpret_as_s16(p2), m89) + p3 * m10 + m11);
+            dbh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m01), v_mul(p1, m2)), m3),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m01), v_mul(p3, m2)), m3));
+            dgh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m45), v_mul(p1, m6)), m7),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m45), v_mul(p3, m6)), m7));
+            drh = v_rshr_pack<BITS>(v_add(v_add(v_dotprod(v_reinterpret_as_s16(p0), m89), v_mul(p1, m10)), m11),
+                                    v_add(v_add(v_dotprod(v_reinterpret_as_s16(p2), m89), v_mul(p3, m10)), m11));
             v_store_interleave(dst + x, v_pack_u(dbl, dbh), v_pack_u(dgl, dgh), v_pack_u(drl, drh));
         }
         m32[1] = saturate_cast<int>((m[3] + 0.5f)*SCALE);
@@ -1537,7 +1537,7 @@ transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, in
 static void
 transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     if( scn == 3 && dcn == 3 )
     {
         int x = 0;
@@ -1555,7 +1555,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
         v_float32 m10 = vx_setall_f32(m[10]);
         v_float32 m11 = vx_setall_f32(m[11] - 32768.f);
         v_int16 delta = vx_setall_s16(-32768);
-        for (; x <= (len - v_uint16::nlanes)*3; x += v_uint16::nlanes*3)
+        for (; x <= (len - VTraits<v_uint16>::vlanes())*3; x +=  VTraits<v_uint16>::vlanes()*3)
         {
             v_uint16 b, g, r;
             v_load_deinterleave(src + x, b, g, r);
@@ -1574,6 +1574,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
             v_store_interleave(dst + x, v_reinterpret_as_u16(db), v_reinterpret_as_u16(dg), v_reinterpret_as_u16(dr));
         }
 #endif
+#if CV_SIMD128
         v_float32x4 _m0l(m[0], m[4], m[ 8], 0.f);
         v_float32x4 _m1l(m[1], m[5], m[ 9], 0.f);
         v_float32x4 _m2l(m[2], m[6], m[10], 0.f);
@@ -1587,6 +1588,7 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
             v_store(dst + x, v_rotate_right<1>(v_reinterpret_as_u16(v_add_wrap(v_pack(
                              v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x    ))), _m0h, _m1h, _m2h, _m3h)),
                              v_round(v_matmuladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand(src + x + 3))), _m0l, _m1l, _m2l, _m3l))), _delta))));
+#endif //CV_SIMD128
         for( ; x < len * 3; x += 3 )
         {
             float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
@@ -1606,25 +1608,25 @@ transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn,
 static void
 transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
 {
-#if CV_SIMD && !defined(__aarch64__) && !defined(_M_ARM64)
+#if (CV_SIMD || CV_SIMD_SCALABLE) && !defined(__aarch64__) && !defined(_M_ARM64)
     int x = 0;
     if( scn == 3 && dcn == 3 )
     {
-        int idx[v_float32::nlanes/2];
-        for( int i = 0; i < v_float32::nlanes/4; i++ )
+        int idx[VTraits<v_float32>::max_nlanes/2];
+        for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
         {
             idx[i] = 3*i;
-            idx[i + v_float32::nlanes/4] = 0;
+            idx[i + VTraits<v_float32>::vlanes()/4] = 0;
         }
         float _m[] = { m[0], m[4], m[ 8], 0.f,
                        m[1], m[5], m[ 9], 0.f,
                        m[2], m[6], m[10], 0.f,
                        m[3], m[7], m[11], 0.f };
-        v_float32 m0 = vx_lut_quads(_m     , idx + v_float32::nlanes/4);
-        v_float32 m1 = vx_lut_quads(_m +  4, idx + v_float32::nlanes/4);
-        v_float32 m2 = vx_lut_quads(_m +  8, idx + v_float32::nlanes/4);
-        v_float32 m3 = vx_lut_quads(_m + 12, idx + v_float32::nlanes/4);
-        for( ; x <= len*3 - v_float32::nlanes; x += 3*v_float32::nlanes/4 )
+        v_float32 m0 = vx_lut_quads(_m     , idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m1 = vx_lut_quads(_m +  4, idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m2 = vx_lut_quads(_m +  8, idx + VTraits<v_float32>::vlanes()/4);
+        v_float32 m3 = vx_lut_quads(_m + 12, idx + VTraits<v_float32>::vlanes()/4);
+        for( ; x <= len*3 - VTraits<v_float32>::vlanes(); x += 3*VTraits<v_float32>::vlanes()/4 )
             v_store(dst + x, v_pack_triplets(v_matmuladd(vx_lut_quads(src + x, idx), m0, m1, m2, m3)));
         for( ; x < len*3; x += 3 )
         {
@@ -1641,8 +1643,8 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
     if( scn == 4 && dcn == 4 )
     {
 #if CV_SIMD_WIDTH > 16
-        int idx[v_float32::nlanes/4];
-        for( int i = 0; i < v_float32::nlanes/4; i++ )
+        int idx[VTraits<v_float32>::max_nlanes/4];
+        for( int i = 0; i < VTraits<v_float32>::vlanes()/4; i++ )
             idx[i] = 0;
         float _m[] = { m[4], m[9], m[14], m[19] };
         v_float32 m0 = vx_lut_quads(m   , idx);
@@ -1650,12 +1652,13 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
         v_float32 m2 = vx_lut_quads(m+10, idx);
         v_float32 m3 = vx_lut_quads(m+15, idx);
         v_float32 m4 = vx_lut_quads(_m, idx);
-        for( ; x <= len*4 - v_float32::nlanes; x += v_float32::nlanes )
+        for( ; x <= len*4 - VTraits<v_float32>::vlanes(); x += VTraits<v_float32>::vlanes() )
         {
             v_float32 v_src = vx_load(src + x);
-            v_store(dst + x, v_reduce_sum4(v_src * m0, v_src * m1, v_src * m2, v_src * m3) + m4);
+            v_store(dst + x, v_add(v_reduce_sum4(v_mul(v_src, m0), v_mul(v_src, m1), v_mul(v_src, m2), v_mul(v_src, m3)), m4));
         }
 #endif
+#if CV_SIMD128
         v_float32x4 _m0 = v_load(m     );
         v_float32x4 _m1 = v_load(m +  5);
         v_float32x4 _m2 = v_load(m + 10);
@@ -1666,6 +1669,17 @@ transform_32f( const float* src, float* dst, const float* m, int len, int scn, i
             v_float32x4 v_src = v_load(src + x);
             v_store(dst + x, v_reduce_sum4(v_src * _m0, v_src * _m1, v_src * _m2, v_src * _m3) + _m4);
         }
+#else // CV_SIMD_WIDTH >= 16 && !CV_SIMD128
+        for( ; x < len*4; x += 4 )
+        {
+            float v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
+            float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[ 2]*v2 + m[ 3]*v3 + m[ 4]);
+            float t1 = saturate_cast<float>(m[5]*v0 + m[6]*v1 + m[ 7]*v2 + m[ 8]*v3 + m[ 9]);
+            float t2 = saturate_cast<float>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
+            float t3 = saturate_cast<float>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
+            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2; dst[x+3] = t3;
+        }
+#endif
         vx_cleanup();
         return;
     }
@@ -1936,9 +1950,9 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
 {
     float alpha = *_alpha;
     int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_alpha = vx_setall_f32(alpha);
-    const int cWidth = v_float32::nlanes;
+    const int cWidth = VTraits<v_float32>::vlanes();
     for (; i <= len - cWidth; i += cWidth)
         v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
     vx_cleanup();
@@ -1953,9 +1967,9 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 {
     double alpha = *_alpha;
     int i = 0;
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     v_float64 a2 = vx_setall_f64(alpha);
-    const int cWidth = v_float64::nlanes;
+    const int cWidth = VTraits<v_float64>::vlanes();
     for (; i <= len - cWidth; i += cWidth)
         v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
     vx_cleanup();
@@ -2078,7 +2092,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
         deltastep = deltastep ? 4 : 0;
     }
 
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
     v_float64x2 v_scale = v_setall_f64(scale);
 #endif
 
@@ -2090,7 +2104,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
 
             for( j = i; j <= size.width - 4; j += 4 )
             {
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2150,7 +2164,7 @@ MulTransposedR(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
 
             for( j = i; j <= size.width - 4; j += 4 )
             {
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     v_float64x2 s0 = v_setzero_f64(), s1 = v_setzero_f64();
@@ -2227,7 +2241,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                 double s = 0;
                 const sT *tsrc1 = src + i*srcstep;
                 const sT *tsrc2 = src + j*srcstep;
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     const double *v_tsrc1 = (double *)(tsrc1);
@@ -2280,7 +2294,7 @@ MulTransposedL(const Mat& srcmat, const Mat& dstmat, const Mat& deltamat, double
                         delta_buf[2] = delta_buf[3] = tdelta2[0];
                     tdelta2 = delta_buf;
                 }
-#if CV_SIMD_64F
+#if CV_SIMD128_64F
                 if (DataType<sT>::depth == CV_64F && DataType<dT>::depth == CV_64F)
                 {
                     const double *v_tsrc2 = (double *)(tsrc2);
@@ -2393,14 +2407,14 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
     double r = 0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 15), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_uint32 v_sum = vx_setzero_u32();
-        const int cWidth = v_uint16::nlanes;
+        const int cWidth = VTraits<v_uint16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@@ -2414,7 +2428,7 @@ double dotProd_8u(const uchar* src1, const uchar* src2, int len)
         {
             v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
             v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));
-            v_sum += v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20));
+            v_sum = v_add(v_sum, v_reinterpret_as_u32(v_dotprod_fast(v_src10, v_src20)));
         }
         r += (double)v_reduce_sum(v_sum);
 
@@ -2433,14 +2447,14 @@ double dotProd_8s(const schar* src1, const schar* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 14), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_int32 v_sum = vx_setzero_s32();
-        const int cWidth = v_int16::nlanes;
+        const int cWidth = VTraits<v_int16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
@@ -2473,14 +2487,14 @@ double dotProd_16u(const ushort* src1, const ushort* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 24), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_uint16>::vlanes(), blockSize0 = (1 << 24), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_uint64 v_sum = vx_setzero_u64();
-        const int cWidth = v_uint16::nlanes;
+        const int cWidth = VTraits<v_uint16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth; j += cWidth)
@@ -2505,14 +2519,14 @@ double dotProd_16s(const short* src1, const short* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 24), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_int16>::vlanes(), blockSize0 = (1 << 24), blockSize;
 
     while (i < len0)
     {
         blockSize = std::min(len0 - i, blockSize0);
         v_int64 v_sum = vx_setzero_s64();
-        const int cWidth = v_int16::nlanes;
+        const int cWidth = VTraits<v_int16>::vlanes();
 
         int j = 0;
         for (; j <= blockSize - cWidth; j += cWidth)
@@ -2534,10 +2548,10 @@ double dotProd_16s(const short* src1, const short* src2, int len)
 
 double dotProd_32s(const int* src1, const int* src2, int len)
 {
-#if CV_SIMD_64F
+#if CV_SIMD_64F // TODO: enable for CV_SIMD_SCALABLE_64F
     double r = .0;
     int i = 0;
-    const int step  = v_int32::nlanes;
+    const int step  = VTraits<v_int32>::vlanes();
     v_float64 v_sum0 = vx_setzero_f64();
 #if CV_SIMD_WIDTH == 16
     const int wstep = step * 2;
@@ -2572,8 +2586,8 @@ double dotProd_32f(const float* src1, const float* src2, int len)
     double r = 0.0;
     int i = 0;
 
-#if CV_SIMD
-    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    int len0 = len & -VTraits<v_float32>::vlanes(), blockSize0 = (1 << 13), blockSize;
 
     while (i < len0)
     {
@@ -2581,7 +2595,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
         v_float32 v_sum = vx_setzero_f32();
 
         int j = 0;
-        int cWidth = v_float32::nlanes;
+        int cWidth = VTraits<v_float32>::vlanes();
 
 #if CV_ENABLE_UNROLLED
         v_float32 v_sum1 = vx_setzero_f32();
@@ -2600,7 +2614,7 @@ double dotProd_32f(const float* src1, const float* src2, int len)
                               vx_load(src2 + j + (cWidth * 3)), v_sum3);
         }
 
-        v_sum += v_sum1 + v_sum2 + v_sum3;
+        v_sum = v_add(v_sum, v_add(v_add(v_sum1, v_sum2), v_sum3));
 #endif
 
         for (; j <= blockSize - cWidth; j += cWidth)
diff --git a/modules/core/src/matrix_transform.cpp b/modules/core/src/matrix_transform.cpp
index 7f1043fbbe..43bf9be057 100644
--- a/modules/core/src/matrix_transform.cpp
+++ b/modules/core/src/matrix_transform.cpp
@@ -7,6 +7,7 @@
 #include "opencv2/core/detail/dispatch_helper.impl.hpp"
 
 #include <algorithm> // std::swap_ranges
+#include <numeric> // std::accumulate
 
 namespace cv {
 
@@ -440,7 +441,7 @@ template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const
 static void
 flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
 {
-#if CV_SIMD
+#if CV_SIMD128
 #if CV_STRONG_ALIGNMENT
     size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
 #endif
@@ -563,7 +564,7 @@ flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size,
     }
 #endif
     else
-#endif // CV_SIMD
+#endif // CV_SIMD128
     {
         int i, j, limit = (int)(((size.width + 1)/2)*esz);
         AutoBuffer<int> _tab(size.width*esz);
@@ -596,12 +597,12 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
                                                   dst0 += dstep, dst1 -= dstep )
     {
         int i = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 #if CV_STRONG_ALIGNMENT
         if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
 #endif
         {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
             {
                 v_int32 t0 = v_reinterpret_as_s32(vx_load(src0 + i));
                 v_int32 t1 = v_reinterpret_as_s32(vx_load(src1 + i));
@@ -612,7 +613,7 @@ flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size,
 #if CV_STRONG_ALIGNMENT
         else
         {
-            for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
+            for (; i <= size.width - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
             {
                 v_uint8 t0 = vx_load(src0 + i);
                 v_uint8 t1 = vx_load(src1 + i);
@@ -857,6 +858,223 @@ void flipND(InputArray _src, OutputArray _dst, int _axis)
     flipNDImpl(dst.ptr(), dst.size.p, dst.step.p, axis);
 }
 
+/* Pads every shape with leading 1's up to max_ndims dimensions and assigns row-major steps (innermost step is 1),
+   then merges neighbouring dimensions that do not have to be broadcast separately. For each array k the result is
+   stored in flatten_shape[k] / flatten_step[k] (max_ndims entries each). Returns false if some dimension is 0. */
+static bool _flatten_for_broadcast(int narrays, int max_ndims, const int* ndims, const int** orig_shape,
+                                   int** flatten_shape, size_t** flatten_step) {
+    int i, j, k;
+
+    // step 1. Right-align every shape to max_ndims dimensions (missing leading dimensions become 1)
+    // and compute dense row-major steps: 1 for the last dimension, size*step of the following
+    // dimension otherwise. A zero-sized dimension makes the function return false.
+    for (i = max_ndims - 1; i >= 0; i-- ) {
+        for (k = 0; k < narrays; k++) {
+            j = ndims[k] - (max_ndims - i); // index into the original shape; negative for padded dimensions
+            int sz_i = j >= 0 ? orig_shape[k][j] : 1;
+            size_t st_i = i == max_ndims - 1 ? 1 : flatten_step[k][i+1] * flatten_shape[k][i+1];
+            flatten_shape[k][i] = sz_i;
+            flatten_step[k][i] = st_i;
+            if (flatten_shape[k][i] == 0)
+                return false;
+        }
+    }
+
+    // step 2. Flatten before zeroing any step (step 3), because the contiguity test below needs the real steps.
+    // j is the slot of the merged dimension being built, i walks the remaining dimensions towards the front.
+    // Dimension i is folded into slot j when it is contiguous with it in every array and either all arrays have
+    // size 1 at i or every array is consistently scalar/non-scalar across i and j; otherwise it opens a new slot.
+    j = max_ndims-1;
+    for (i = j - 1; i >= 0; i--) {
+        bool all_contiguous = true, all_scalars = true, all_consistent = true;
+        for(k = 0; k < narrays; k++) {
+            size_t st = flatten_step[k][j] * flatten_shape[k][j]; // step a dimension placed right before slot j would have
+            bool prev_scalar = flatten_shape[k][j] == 1;
+            bool scalar = flatten_shape[k][i] == 1;
+            all_contiguous = all_contiguous && (st == flatten_step[k][i]);
+            all_scalars = all_scalars && scalar;
+            all_consistent = all_consistent && (scalar == prev_scalar);
+        }
+        if (all_contiguous && (all_consistent || all_scalars)) {
+            for(k = 0; k < narrays; k++)
+                flatten_shape[k][j] *= flatten_shape[k][i]; // merge: slot j absorbs dimension i, its step is kept
+        } else {
+            j--; // open the next slot towards the front
+            if (i < j) { // earlier merges left a gap, so move dimension i into slot j
+                for(k = 0; k < narrays; k++) {
+                    flatten_shape[k][j] = flatten_shape[k][i];
+                    flatten_step[k][j] = flatten_step[k][i];
+                }
+            }
+        }
+    }
+
+    // step 3. Size-1 dimensions in the used slots [j, max_ndims) get step 0; unused leading slots become size 1, step 0.
+    for (i = max_ndims-1; i >= j; i--) {
+        for (k = 0; k < narrays; k++)
+            flatten_step[k][i] = flatten_shape[k][i] == 1 ? 0 : flatten_step[k][i];
+    }
+    for (; i >= 0; i--) {
+        for (k = 0; k < narrays; k++) {
+            flatten_step[k][i] = 0;
+            flatten_shape[k][i] = 1;
+        }
+    }
+    return true;
+}
+
+void broadcast(InputArray _src, InputArray _shape, OutputArray _dst) {
+    CV_INSTRUMENT_REGION();
+    // Creates _dst with the shape listed in _shape and fills it with _src, repeating _src along each dimension where its size is 1.
+    Mat src = _src.getMat();
+    CV_CheckTrue(src.isContinuous(), "broadcast: input array must be contiguous");
+    CV_CheckChannelsEQ(src.channels(), 1, "broadcast: input array must be single channel");
+
+    Mat shape = _shape.getMat();
+    CV_CheckTypeEQ(shape.type(), CV_32S, "broadcast: target shape must be of type int32");
+    const auto dims_shape = static_cast<int>(shape.total());
+    const auto *ptr_shape = shape.ptr<int>();
+    // check valid shape, 1D/0D Mat would fail in the following checks
+    const auto dims_src = src.dims;
+    CV_CheckLE(dims_src, dims_shape,
+               "broadcast: dimension of input array must be less than or equal to dimension of target shape");
+    std::vector<int> shape_src{src.size.p, src.size.p + dims_src};
+    if (shape_src.size() < static_cast<size_t>(dims_shape)) {
+        shape_src.insert(shape_src.begin(), dims_shape - shape_src.size(), 1);
+    }
+    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
+        const auto *shape_target = ptr_shape;
+        if (shape_src[i] != 1) {
+            CV_CheckEQ(shape_src[i], shape_target[i], "target shape must be equal to input shape or 1");
+        }
+    }
+
+    // impl: allocate the destination first; a target shape that contains 0 gives an empty array with nothing to fill
+    _dst.create(dims_shape, shape.ptr<int>(), src.type());
+    Mat dst = _dst.getMat();
+    if (dst.empty()) return; // the copy code below would write to unallocated memory and divide by a zero dimension
+    std::vector<int> is_same_shape(dims_shape, 0);
+    for (int i = 0; i < static_cast<int>(shape_src.size()); ++i) {
+        if (shape_src[i] == ptr_shape[i]) {
+            is_same_shape[i] = 1;
+        }
+    }
+    // copy if same shape (the product of the per-dimension flags is non-zero only when every flag is 1)
+    if (std::accumulate(is_same_shape.begin(), is_same_shape.end(), 1, std::multiplies<int>()) != 0) {
+        const auto *p_src = src.ptr<const char>();
+        auto *p_dst = dst.ptr<char>();
+        std::memcpy(p_dst, p_src, dst.total() * dst.elemSize());
+        return;
+    }
+    // other cases: flatten both shapes to as few dimensions as possible, then copy plane by plane, row by row
+    int max_ndims = std::max(dims_src, dims_shape);
+    const int all_ndims[2] = {src.dims, dst.dims};
+    const int* orig_shapes[2] = {src.size.p, dst.size.p};
+    cv::AutoBuffer<size_t> buff(max_ndims * 4); // four max_ndims-long slices: 2 shape arrays followed by 2 step arrays
+    int* flatten_shapes[2] = {(int*)buff.data(), (int*)(buff.data() + max_ndims)};
+    size_t* flatten_steps[2] = {(size_t*)(buff.data() + 2 * max_ndims), (size_t*)(buff.data() + 3 * max_ndims)};
+    if (_flatten_for_broadcast(2, max_ndims, all_ndims, orig_shapes, flatten_shapes, flatten_steps)) {
+        size_t src_dp = flatten_steps[0][max_ndims - 1]; // innermost source step: 1 = copy the row, otherwise repeat one value
+        size_t dst_dp = flatten_steps[1][max_ndims - 1];
+        CV_Assert(dst_dp == 1); // NOTE(review): the step is 0 when the innermost destination size is 1 - confirm such targets cannot reach here
+        CV_Assert(max_ndims >= 2); // >= 3?
+        size_t rowstep_src = flatten_steps[0][max_ndims - 2];
+        size_t rowstep_dst = flatten_steps[1][max_ndims - 2];
+        const char* ptr_src = src.ptr<const char>();
+        char* ptr_dst = dst.ptr<char>();
+        size_t esz = src.elemSize();
+        int nrows = flatten_shapes[1][max_ndims - 2];
+        int ncols = flatten_shapes[1][max_ndims - 1];
+        int nplanes = 1; // product of all destination dimensions in front of the last two
+        CV_Check(esz, esz == 1 || esz == 2 || esz == 4 || esz == 8, "broadcast: not supported data type");
+
+        for (int k = 0; k < max_ndims - 2; k++) {
+            nplanes *= flatten_shapes[1][k];
+        }
+        for (int plane_idx = 0; plane_idx < nplanes; plane_idx++) {
+            size_t offset_src = 0, offset_dst = 0;
+            size_t idx = (size_t)plane_idx; // unravelled below into one index per leading dimension
+            for (int k = max_ndims - 3; k >= 0; k--) {
+                size_t prev_idx = idx / flatten_shapes[1][k];
+                size_t i_k = (int)(idx - prev_idx * flatten_shapes[1][k]);
+                offset_src += i_k * flatten_steps[0][k]; // a zero source step re-reads the same data (broadcast)
+                offset_dst += i_k * flatten_steps[1][k];
+                idx = prev_idx;
+            }
+            // copies the rows of the current plane; elements are moved as opaque integers of esz bytes
+            #define OPENCV_CORE_BROADCAST_LOOP(_Tp) \
+                for (int i = 0; i < nrows; i++) {   \
+                    const _Tp *ptr_src_ = (const _Tp*)ptr_src + offset_src + rowstep_src * i; \
+                    _Tp *ptr_dst_ = (_Tp*)ptr_dst + offset_dst + rowstep_dst * i; \
+                    if (src_dp == 1) { \
+                        for (int j = 0; j < ncols; j++) { \
+                            ptr_dst_[j] = ptr_src_[j]; \
+                        } \
+                    } else { \
+                        _Tp x = *ptr_src_; \
+                        for (int j = 0; j < ncols; j++) { \
+                            ptr_dst_[j] = x; \
+                        } \
+                    } \
+                }
+
+            if (esz == 1) {
+                OPENCV_CORE_BROADCAST_LOOP(int8_t);
+            } else if (esz == 2) {
+                OPENCV_CORE_BROADCAST_LOOP(int16_t);
+            } else if (esz == 4) {
+                OPENCV_CORE_BROADCAST_LOOP(int32_t);
+            } else if (esz == 8) {
+                OPENCV_CORE_BROADCAST_LOOP(int64_t);
+            } else {
+                CV_Error(cv::Error::StsNotImplemented, "");
+            }
+            #undef OPENCV_CORE_BROADCAST_LOOP
+        }
+    } else {
+        // fallback when flattening reports a zero-sized dimension (not expected after the empty check above): initial copy (src to dst)
+        std::vector<size_t> step_src{src.step.p, src.step.p + dims_src};
+        if (step_src.size() < static_cast<size_t>(dims_shape)) {
+            step_src.insert(step_src.begin(), dims_shape - step_src.size(), step_src[0]);
+        }
+        for (size_t i = 0; i < src.total(); ++i) {
+            size_t t = i;
+            size_t src_offset = 0, dst_offset = 0;
+            for (int j = static_cast<int>(shape_src.size() - 1); j >= 0; --j) {
+                size_t idx = t / shape_src[j];
+                size_t offset = static_cast<size_t>(t - idx * shape_src[j]);
+                src_offset += offset * step_src[j];
+                dst_offset += offset * dst.step[j];
+                t = idx;
+            }
+            const auto *p_src = src.ptr<const char>();
+            auto *p_dst = dst.ptr<char>();
+            std::memcpy(p_dst + dst_offset, p_src + src_offset, dst.elemSize());
+        }
+        // broadcast copy (dst inplace): for every broadcast dimension, replicate the first slice over the remaining ones
+        std::vector<int> cumulative_shape(dims_shape, 1);
+        int total = static_cast<int>(dst.total());
+        for (int i = dims_shape - 1; i >= 0; --i) {
+            cumulative_shape[i] = static_cast<int>(total / ptr_shape[i]);
+            total = cumulative_shape[i];
+        }
+        for (int i = dims_shape - 1; i >= 0; --i) {
+            if (is_same_shape[i] == 1) {
+                continue;
+            }
+            auto step = dst.step[i];
+            auto *p_dst = dst.ptr<char>();
+            for (int j = 0; j < cumulative_shape[i]; j++) {
+                for (int k = 0; k < ptr_shape[i] - 1; k++) {
+                    std::memcpy(p_dst + step, p_dst, step);
+                    p_dst += step;
+                }
+                p_dst += step;
+            }
+        }
+    }
+}
+
 void rotate(InputArray _src, OutputArray _dst, int rotateMode)
 {
     CV_Assert(_src.dims() <= 2);
diff --git a/modules/core/src/mean.dispatch.cpp b/modules/core/src/mean.dispatch.cpp
index 6a5275ab43..0f94e5421a 100644
--- a/modules/core/src/mean.dispatch.cpp
+++ b/modules/core/src/mean.dispatch.cpp
@@ -8,20 +8,24 @@
 #include "opencv2/core/openvx/ovx_defs.hpp"
 #include "stat.hpp"
 
+#ifndef OPENCV_IPP_MEAN
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MEAN
 
 #include "mean.simd.hpp"
 #include "mean.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
+#ifndef OPENCV_IPP_MEAN
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MEAN
 
 namespace cv {
 
diff --git a/modules/core/src/merge.dispatch.cpp b/modules/core/src/merge.dispatch.cpp
index 6b8c2d8135..bd7a936cf9 100644
--- a/modules/core/src/merge.dispatch.cpp
+++ b/modules/core/src/merge.dispatch.cpp
@@ -121,6 +121,7 @@ void merge(const Mat* mv, size_t n, OutputArray _dst)
     CV_INSTRUMENT_REGION();
 
     CV_Assert( mv && n > 0 );
+    CV_Assert(!mv[0].empty());
 
     int depth = mv[0].depth();
     bool allch1 = true;
diff --git a/modules/core/src/merge.simd.hpp b/modules/core/src/merge.simd.hpp
index ad08dd8879..d67a117c7b 100644
--- a/modules/core/src/merge.simd.hpp
+++ b/modules/core/src/merge.simd.hpp
@@ -15,7 +15,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 /*
   The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
   on IA there are instructions movntps and such to which
@@ -38,7 +38,7 @@ void merge64s(const int64** src, int64* dst, int len, int cn);
 template<typename T, typename VecT> static void
 vecmerge_( const T** src, T* dst, int len, int cn )
 {
-    const int VECSZ = VecT::nlanes;
+    const int VECSZ = VTraits<VecT>::vlanes();
     int i, i0 = 0;
     const T* src0 = src[0];
     const T* src1 = src[1];
@@ -173,8 +173,8 @@ merge_( const T** src, T* dst, int len, int cn )
 void merge8u(const uchar** src, uchar* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<uchar, v_uint8>(src, dst, len, cn);
     else
 #endif
@@ -184,8 +184,8 @@ void merge8u(const uchar** src, uchar* dst, int len, int cn )
 void merge16u(const ushort** src, ushort* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<ushort, v_uint16>(src, dst, len, cn);
     else
 #endif
@@ -195,8 +195,8 @@ void merge16u(const ushort** src, ushort* dst, int len, int cn )
 void merge32s(const int** src, int* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int32>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<int, v_int32>(src, dst, len, cn);
     else
 #endif
@@ -206,8 +206,8 @@ void merge32s(const int** src, int* dst, int len, int cn )
 void merge64s(const int64** src, int64* dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
         vecmerge_<int64, v_int64>(src, dst, len, cn);
     else
 #endif
diff --git a/modules/core/src/minmax.cpp b/modules/core/src/minmax.cpp
index d694d99d3c..d4328f5070 100644
--- a/modules/core/src/minmax.cpp
+++ b/modules/core/src/minmax.cpp
@@ -11,11 +11,13 @@
 
 #include <algorithm>
 
+#ifndef OPENCV_IPP_MINMAX
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_MINMAX
 
 #define IPP_DISABLE_MINMAXIDX_MANY_ROWS 1  // see Core_MinMaxIdx.rows_overflow test
 
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index 72d6fd9abc..931d6f02b8 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -63,25 +63,25 @@ int normHamming(const uchar* a, int n, int cellSize)
         return -1;
     int i = 0;
     int result = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint64 t = vx_setzero_u64();
     if ( cellSize == 2)
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
-            t += v_popcount(v_reinterpret_as_u64((a0 | (a0 >> 1)) & mask));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a0, v_shr<1>(a0)), mask))));
         }
     }
     else    // cellSize == 4
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
             v_uint16 a0 = v_reinterpret_as_u16(vx_load(a + i));
-            v_uint16 a1 = a0 | (a0 >> 2);
-            t += v_popcount(v_reinterpret_as_u64((a1 | (a1 >> 1)) & mask));
+            v_uint16 a1 = v_or(a0, v_shr<2>(a0));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(a1, v_shr<1>(a1)), mask))));
 
         }
     }
@@ -109,25 +109,25 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
         return -1;
     int i = 0;
     int result = 0;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_uint64 t = vx_setzero_u64();
     if ( cellSize == 2)
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x55));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
-            v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
-            t += v_popcount(v_reinterpret_as_u64((ab0 | (ab0 >> 1)) & mask));
+            v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab0, v_shr<1>(ab0)), mask))));
         }
     }
     else    // cellSize == 4
     {
         v_uint16 mask = v_reinterpret_as_u16(vx_setall_u8(0x11));
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        for(; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
         {
-            v_uint16 ab0 = v_reinterpret_as_u16(vx_load(a + i) ^ vx_load(b + i));
-            v_uint16 ab1 = ab0 | (ab0 >> 2);
-            t += v_popcount(v_reinterpret_as_u64((ab1 | (ab1 >> 1)) & mask));
+            v_uint16 ab0 = v_reinterpret_as_u16(v_xor(vx_load(a + i), vx_load(b + i)));
+            v_uint16 ab1 = v_or(ab0, v_shr<2>(ab0));
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_and(v_or(ab1, v_shr<1>(ab1)), mask))));
         }
     }
     result += (int)v_reduce_sum(t);
@@ -145,21 +145,21 @@ int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
 float normL2Sqr_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
-    for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
+    for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
     {
-        v_float32 t0 = vx_load(a + j) - vx_load(b + j);
-        v_float32 t1 = vx_load(a + j + v_float32::nlanes) - vx_load(b + j + v_float32::nlanes);
+        v_float32 t0 = v_sub(vx_load(a + j), vx_load(b + j));
+        v_float32 t1 = v_sub(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes()));
         v_d0 = v_muladd(t0, t0, v_d0);
-        v_float32 t2 = vx_load(a + j + 2 * v_float32::nlanes) - vx_load(b + j + 2 * v_float32::nlanes);
+        v_float32 t2 = v_sub(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes()));
         v_d1 = v_muladd(t1, t1, v_d1);
-        v_float32 t3 = vx_load(a + j + 3 * v_float32::nlanes) - vx_load(b + j + 3 * v_float32::nlanes);
+        v_float32 t3 = v_sub(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes()));
         v_d2 = v_muladd(t2, t2, v_d2);
         v_d3 = v_muladd(t3, t3, v_d3);
     }
-    d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
+    d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
 #endif
     for( ; j < n; j++ )
     {
@@ -173,17 +173,17 @@ float normL2Sqr_(const float* a, const float* b, int n)
 float normL1_(const float* a, const float* b, int n)
 {
     int j = 0; float d = 0.f;
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     v_float32 v_d0 = vx_setzero_f32(), v_d1 = vx_setzero_f32();
     v_float32 v_d2 = vx_setzero_f32(), v_d3 = vx_setzero_f32();
-    for (; j <= n - 4 * v_float32::nlanes; j += 4 * v_float32::nlanes)
+    for (; j <= n - 4 * VTraits<v_float32>::vlanes(); j += 4 * VTraits<v_float32>::vlanes())
     {
-        v_d0 += v_absdiff(vx_load(a + j), vx_load(b + j));
-        v_d1 += v_absdiff(vx_load(a + j + v_float32::nlanes), vx_load(b + j + v_float32::nlanes));
-        v_d2 += v_absdiff(vx_load(a + j + 2 * v_float32::nlanes), vx_load(b + j + 2 * v_float32::nlanes));
-        v_d3 += v_absdiff(vx_load(a + j + 3 * v_float32::nlanes), vx_load(b + j + 3 * v_float32::nlanes));
+        v_d0 = v_add(v_d0, v_absdiff(vx_load(a + j), vx_load(b + j)));
+        v_d1 = v_add(v_d1, v_absdiff(vx_load(a + j + VTraits<v_float32>::vlanes()), vx_load(b + j + VTraits<v_float32>::vlanes())));
+        v_d2 = v_add(v_d2, v_absdiff(vx_load(a + j + 2 * VTraits<v_float32>::vlanes()), vx_load(b + j + 2 * VTraits<v_float32>::vlanes())));
+        v_d3 = v_add(v_d3, v_absdiff(vx_load(a + j + 3 * VTraits<v_float32>::vlanes()), vx_load(b + j + 3 * VTraits<v_float32>::vlanes())));
     }
-    d = v_reduce_sum(v_d0 + v_d1 + v_d2 + v_d3);
+    d = v_reduce_sum(v_add(v_add(v_add(v_d0, v_d1), v_d2), v_d3));
 #endif
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
@@ -193,12 +193,12 @@ float normL1_(const float* a, const float* b, int n)
 int normL1_(const uchar* a, const uchar* b, int n)
 {
     int j = 0, d = 0;
-#if CV_SIMD
-    for (; j <= n - 4 * v_uint8::nlanes; j += 4 * v_uint8::nlanes)
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    for (; j <= n - 4 * VTraits<v_uint8>::vlanes(); j += 4 * VTraits<v_uint8>::vlanes())
         d += v_reduce_sad(vx_load(a + j), vx_load(b + j)) +
-             v_reduce_sad(vx_load(a + j + v_uint8::nlanes), vx_load(b + j + v_uint8::nlanes)) +
-             v_reduce_sad(vx_load(a + j + 2 * v_uint8::nlanes), vx_load(b + j + 2 * v_uint8::nlanes)) +
-             v_reduce_sad(vx_load(a + j + 3 * v_uint8::nlanes), vx_load(b + j + 3 * v_uint8::nlanes));
+             v_reduce_sad(vx_load(a + j + VTraits<v_uint8>::vlanes()), vx_load(b + j + VTraits<v_uint8>::vlanes())) +
+             v_reduce_sad(vx_load(a + j + 2 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 2 * VTraits<v_uint8>::vlanes())) +
+             v_reduce_sad(vx_load(a + j + 3 * VTraits<v_uint8>::vlanes()), vx_load(b + j + 3 * VTraits<v_uint8>::vlanes()));
 #endif
     for( ; j < n; j++ )
         d += std::abs(a[j] - b[j]);
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index cf0a6466ea..f39dfcae7d 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -322,16 +322,20 @@ int decodeSimpleFormat( const char* dt )
 
 }
 
-#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64)
-#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 1
+#if defined __i386__ || defined(_M_IX86) || defined __x86_64__ || defined(_M_X64) || \
+    (defined (__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__)
+#define CV_LITTLE_ENDIAN_MEM_ACCESS 1
 #else
-#define CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS 0
+#define CV_LITTLE_ENDIAN_MEM_ACCESS 0
 #endif
 
 static inline int readInt(const uchar* p)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    return *(const int*)p;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    int val;
+    memcpy(&val, p, sizeof(val));
+    return val;
 #else
     int val = (int)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
     return val;
@@ -340,8 +344,11 @@ static inline int readInt(const uchar* p)
 
 static inline double readReal(const uchar* p)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    return *(const double*)p;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    double val;
+    memcpy(&val, p, sizeof(val));
+    return val;
 #else
     unsigned val0 = (unsigned)(p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24));
     unsigned val1 = (unsigned)(p[4] | (p[5] << 8) | (p[6] << 16) | (p[7] << 24));
@@ -353,9 +360,9 @@ static inline double readReal(const uchar* p)
 
 static inline void writeInt(uchar* p, int ival)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    int* ip = (int*)p;
-    *ip = ival;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    memcpy(p, &ival, sizeof(ival));
 #else
     p[0] = (uchar)ival;
     p[1] = (uchar)(ival >> 8);
@@ -366,9 +373,9 @@ static inline void writeInt(uchar* p, int ival)
 
 static inline void writeReal(uchar* p, double fval)
 {
-#if CV_UNALIGNED_LITTLE_ENDIAN_MEM_ACCESS
-    double* fp = (double*)p;
-    *fp = fval;
+    // On little endian CPUs, both branches produce the same result. On big endian, only the else branch does.
+#if CV_LITTLE_ENDIAN_MEM_ACCESS
+    memcpy(p, &fval, sizeof(fval));
 #else
     Cv64suf v;
     v.f = fval;
diff --git a/modules/core/src/persistence_xml.cpp b/modules/core/src/persistence_xml.cpp
index caba4f5bf0..6141fade2d 100644
--- a/modules/core/src/persistence_xml.cpp
+++ b/modules/core/src/persistence_xml.cpp
@@ -308,8 +308,8 @@ public:
 
         if( !multiline )
         {
-            ptr = fs->resizeWriteBuffer( ptr, len + 9 );
-            sprintf( ptr, "<!-- %s -->", comment );
+            ptr = fs->resizeWriteBuffer( ptr, len + 5+4+1 );
+            snprintf( ptr, len + 5+4+1, "<!-- %s -->", comment );
             len = (int)strlen(ptr);
         }
         else
@@ -344,7 +344,7 @@ public:
                 fs->setBufferPtr(ptr);
                 ptr = fs->flush();
             }
-            sprintf( ptr, "-->" );
+            strcpy( ptr, "-->" );
             fs->setBufferPtr(ptr + 3);
             fs->flush();
         }
diff --git a/modules/core/src/split.simd.hpp b/modules/core/src/split.simd.hpp
index 25e90c0063..88414161b8 100644
--- a/modules/core/src/split.simd.hpp
+++ b/modules/core/src/split.simd.hpp
@@ -15,12 +15,12 @@ void split64s(const int64* src, int64** dst, int len, int cn);
 
 #ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 // see the comments for vecmerge_ in merge.cpp
 template<typename T, typename VecT> static void
 vecsplit_( const T* src, T** dst, int len, int cn )
 {
-    const int VECSZ = VecT::nlanes;
+    const int VECSZ = VTraits<VecT>::vlanes();
     int i, i0 = 0;
     T* dst0 = dst[0];
     T* dst1 = dst[1];
@@ -177,8 +177,8 @@ split_( const T* src, T** dst, int len, int cn )
 void split8u(const uchar* src, uchar** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint8>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<uchar, v_uint8>(src, dst, len, cn);
     else
 #endif
@@ -188,8 +188,8 @@ void split8u(const uchar* src, uchar** dst, int len, int cn )
 void split16u(const ushort* src, ushort** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint16>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<ushort, v_uint16>(src, dst, len, cn);
     else
 #endif
@@ -199,8 +199,8 @@ void split16u(const ushort* src, ushort** dst, int len, int cn )
 void split32s(const int* src, int** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_uint32>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<int, v_int32>(src, dst, len, cn);
     else
 #endif
@@ -210,8 +210,8 @@ void split32s(const int* src, int** dst, int len, int cn )
 void split64s(const int64* src, int64** dst, int len, int cn )
 {
     CV_INSTRUMENT_REGION();
-#if CV_SIMD
-    if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 )
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    if( len >= VTraits<v_int64>::vlanes() && 2 <= cn && cn <= 4 )
         vecsplit_<int64, v_int64>(src, dst, len, cn);
     else
 #endif
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp
index 0592f84794..a5fb05476d 100644
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -33,11 +33,11 @@ int normHamming(const uchar* a, int n)
     int i = 0;
     int result = 0;
 
-#if CV_SIMD && CV_SIMD_WIDTH > 16
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_uint64 t = vx_setzero_u64();
-        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(vx_load(a + i))));
         result = (int)v_reduce_sum(t);
         vx_cleanup();
     }
@@ -56,13 +56,6 @@ int normHamming(const uchar* a, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i));
         }
     }
-#elif CV_SIMD
-    {
-        v_uint64x2 t = v_setzero_u64();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
-        result += (int)v_reduce_sum(t);
-    }
 #endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
@@ -85,11 +78,11 @@ int normHamming(const uchar* a, const uchar* b, int n)
     int i = 0;
     int result = 0;
 
-#if CV_SIMD && CV_SIMD_WIDTH > 16
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     {
         v_uint64 t = vx_setzero_u64();
-        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        for (; i <= n - VTraits<v_uint8>::vlanes(); i += VTraits<v_uint8>::vlanes())
+            t = v_add(t, v_popcount(v_reinterpret_as_u64(v_xor(vx_load(a + i), vx_load(b + i)))));
         result += (int)v_reduce_sum(t);
     }
 #endif
@@ -107,13 +100,6 @@ int normHamming(const uchar* a, const uchar* b, int n)
             result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
         }
     }
-#elif CV_SIMD
-    {
-        v_uint64x2 t = v_setzero_u64();
-        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(v_load(a + i) ^ v_load(b + i)));
-        result += (int)v_reduce_sum(t);
-    }
 #endif
 #if CV_ENABLE_UNROLLED
     for(; i <= n - 4; i += 4)
diff --git a/modules/core/src/sum.dispatch.cpp b/modules/core/src/sum.dispatch.cpp
index a1f7d73868..fade948336 100644
--- a/modules/core/src/sum.dispatch.cpp
+++ b/modules/core/src/sum.dispatch.cpp
@@ -10,11 +10,13 @@
 #include "sum.simd.hpp"
 #include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
 
+#ifndef OPENCV_IPP_SUM
 #undef HAVE_IPP
 #undef CV_IPP_RUN_FAST
 #define CV_IPP_RUN_FAST(f, ...)
 #undef CV_IPP_RUN
 #define CV_IPP_RUN(c, f, ...)
+#endif // OPENCV_IPP_SUM
 
 namespace cv
 {
diff --git a/modules/core/src/sum.simd.hpp b/modules/core/src/sum.simd.hpp
index 045f40ebed..f790fc733a 100644
--- a/modules/core/src/sum.simd.hpp
+++ b/modules/core/src/sum.simd.hpp
@@ -22,7 +22,7 @@ struct Sum_SIMD
     }
 };
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
 
 template <>
 struct Sum_SIMD<uchar, int>
@@ -36,41 +36,41 @@ struct Sum_SIMD<uchar, int>
         int x = 0;
         v_uint32 v_sum = vx_setzero_u32();
 
-        int len0 = len & -v_uint8::nlanes;
+        int len0 = len & -VTraits<v_uint8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256*v_uint16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_uint16>::vlanes(), len0);
             v_uint16 v_sum16 = vx_setzero_u16();
-            for (; x < len_tmp; x += v_uint8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_uint8>::vlanes())
             {
                 v_uint16 v_src0, v_src1;
                 v_expand(vx_load(src0 + x), v_src0, v_src1);
-                v_sum16 += v_src0 + v_src1;
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
             }
             v_uint32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
         }
-        if (x <= len - v_uint16::nlanes)
+        if (x <= len - VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_half0, v_half1;
             v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
-            x += v_uint16::nlanes;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
+            x += VTraits<v_uint16>::vlanes();
         }
-        if (x <= len - v_uint32::nlanes)
+        if (x <= len - VTraits<v_uint32>::vlanes())
         {
-            v_sum += vx_load_expand_q(src0 + x);
-            x += v_uint32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
+            x += VTraits<v_uint32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_uint32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -91,41 +91,41 @@ struct Sum_SIMD<schar, int>
         int x = 0;
         v_int32 v_sum = vx_setzero_s32();
 
-        int len0 = len & -v_int8::nlanes;
+        int len0 = len & -VTraits<v_int8>::vlanes();
         while (x < len0)
         {
-            const int len_tmp = min(x + 256*v_int16::nlanes, len0);
+            const int len_tmp = min(x + 256*VTraits<v_int16>::vlanes(), len0);
             v_int16 v_sum16 = vx_setzero_s16();
-            for (; x < len_tmp; x += v_int8::nlanes)
+            for (; x < len_tmp; x += VTraits<v_int8>::vlanes())
             {
                 v_int16 v_src0, v_src1;
                 v_expand(vx_load(src0 + x), v_src0, v_src1);
-                v_sum16 += v_src0 + v_src1;
+                v_sum16 = v_add(v_sum16, v_add(v_src0, v_src1));
             }
             v_int32 v_half0, v_half1;
             v_expand(v_sum16, v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
         }
-        if (x <= len - v_int16::nlanes)
+        if (x <= len - VTraits<v_int16>::vlanes())
         {
             v_int32 v_half0, v_half1;
             v_expand(vx_load_expand(src0 + x), v_half0, v_half1);
-            v_sum += v_half0 + v_half1;
-            x += v_int16::nlanes;
+            v_sum = v_add(v_sum, v_add(v_half0, v_half1));
+            x += VTraits<v_int16>::vlanes();
         }
-        if (x <= len - v_int32::nlanes)
+        if (x <= len - VTraits<v_int32>::vlanes())
         {
-            v_sum += vx_load_expand_q(src0 + x);
-            x += v_int32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand_q(src0 + x));
+            x += VTraits<v_int32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -146,25 +146,25 @@ struct Sum_SIMD<ushort, int>
         int x = 0;
         v_uint32 v_sum = vx_setzero_u32();
 
-        for (; x <= len - v_uint16::nlanes; x += v_uint16::nlanes)
+        for (; x <= len - VTraits<v_uint16>::vlanes(); x += VTraits<v_uint16>::vlanes())
         {
             v_uint32 v_src0, v_src1;
             v_expand(vx_load(src0 + x), v_src0, v_src1);
-            v_sum += v_src0 + v_src1;
+            v_sum = v_add(v_sum, v_add(v_src0, v_src1));
         }
-        if (x <= len - v_uint32::nlanes)
+        if (x <= len - VTraits<v_uint32>::vlanes())
         {
-            v_sum += vx_load_expand(src0 + x);
-            x += v_uint32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand(src0 + x));
+            x += VTraits<v_uint32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_uint32::nlanes];
+            uint32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_uint32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_uint32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_uint32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -185,25 +185,25 @@ struct Sum_SIMD<short, int>
         int x = 0;
         v_int32 v_sum = vx_setzero_s32();
 
-        for (; x <= len - v_int16::nlanes; x += v_int16::nlanes)
+        for (; x <= len - VTraits<v_int16>::vlanes(); x += VTraits<v_int16>::vlanes())
         {
             v_int32 v_src0, v_src1;
             v_expand(vx_load(src0 + x), v_src0, v_src1);
-            v_sum += v_src0 + v_src1;
+            v_sum = v_add(v_sum, v_add(v_src0, v_src1));
         }
-        if (x <= len - v_int32::nlanes)
+        if (x <= len - VTraits<v_int32>::vlanes())
         {
-            v_sum += vx_load_expand(src0 + x);
-            x += v_int32::nlanes;
+            v_sum = v_add(v_sum, vx_load_expand(src0 + x));
+            x += VTraits<v_int32>::vlanes();
         }
 
         if (cn == 1)
             *dst += v_reduce_sum(v_sum);
         else
         {
-            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_int32::nlanes];
+            int32_t CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_int32>::max_nlanes];
             v_store_aligned(ar, v_sum);
-            for (int i = 0; i < v_int32::nlanes; ++i)
+            for (int i = 0; i < VTraits<v_int32>::vlanes(); ++i)
                 dst[i % cn] += ar[i];
         }
         v_cleanup();
@@ -212,7 +212,7 @@ struct Sum_SIMD<short, int>
     }
 };
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
 template <>
 struct Sum_SIMD<int, double>
 {
@@ -226,24 +226,24 @@ struct Sum_SIMD<int, double>
         v_float64 v_sum0 = vx_setzero_f64();
         v_float64 v_sum1 = vx_setzero_f64();
 
-        for (; x <= len - 2 * v_int32::nlanes; x += 2 * v_int32::nlanes)
+        for (; x <= len - 2 * VTraits<v_int32>::vlanes(); x += 2 * VTraits<v_int32>::vlanes())
         {
             v_int32 v_src0 = vx_load(src0 + x);
-            v_int32 v_src1 = vx_load(src0 + x + v_int32::nlanes);
-            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
-            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
+            v_int32 v_src1 = vx_load(src0 + x + VTraits<v_int32>::vlanes());
+            v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
+            v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
         }
 
 #if CV_SIMD256 || CV_SIMD512
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
-        v_store_aligned(ar, v_sum0 + v_sum1);
-        for (int i = 0; i < v_float64::nlanes; ++i)
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
+        v_store_aligned(ar, v_add(v_sum0, v_sum1));
+        for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #else
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
         v_store_aligned(ar, v_sum0);
-        v_store_aligned(ar + v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
@@ -265,24 +265,24 @@ struct Sum_SIMD<float, double>
         v_float64 v_sum0 = vx_setzero_f64();
         v_float64 v_sum1 = vx_setzero_f64();
 
-        for (; x <= len - 2 * v_float32::nlanes; x += 2 * v_float32::nlanes)
+        for (; x <= len - 2 * VTraits<v_float32>::vlanes(); x += 2 * VTraits<v_float32>::vlanes())
         {
             v_float32 v_src0 = vx_load(src0 + x);
-            v_float32 v_src1 = vx_load(src0 + x + v_float32::nlanes);
-            v_sum0 += v_cvt_f64(v_src0) + v_cvt_f64(v_src1);
-            v_sum1 += v_cvt_f64_high(v_src0) + v_cvt_f64_high(v_src1);
+            v_float32 v_src1 = vx_load(src0 + x + VTraits<v_float32>::vlanes());
+            v_sum0 = v_add(v_sum0, v_add(v_cvt_f64(v_src0), v_cvt_f64(v_src1)));
+            v_sum1 = v_add(v_sum1, v_add(v_cvt_f64_high(v_src0), v_cvt_f64_high(v_src1)));
         }
 
 #if CV_SIMD256 || CV_SIMD512
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[v_float64::nlanes];
-        v_store_aligned(ar, v_sum0 + v_sum1);
-        for (int i = 0; i < v_float64::nlanes; ++i)
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[VTraits<v_float64>::max_nlanes];
+        v_store_aligned(ar, v_add(v_sum0, v_sum1));
+        for (int i = 0; i < VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #else
-        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * v_float64::nlanes];
+        double CV_DECL_ALIGNED(CV_SIMD_WIDTH) ar[2 * VTraits<v_float64>::max_nlanes];
         v_store_aligned(ar, v_sum0);
-        v_store_aligned(ar + v_float64::nlanes, v_sum1);
-        for (int i = 0; i < 2 * v_float64::nlanes; ++i)
+        v_store_aligned(ar + VTraits<v_float64>::vlanes(), v_sum1);
+        for (int i = 0; i < 2 * VTraits<v_float64>::vlanes(); ++i)
             dst[i % cn] += ar[i];
 #endif
         v_cleanup();
diff --git a/modules/core/src/utils/filesystem.cpp b/modules/core/src/utils/filesystem.cpp
index 17004b27dd..5199f6f57b 100644
--- a/modules/core/src/utils/filesystem.cpp
+++ b/modules/core/src/utils/filesystem.cpp
@@ -34,7 +34,7 @@
 #include <errno.h>
 #include <io.h>
 #include <stdio.h>
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -343,7 +343,7 @@ private:
     Impl& operator=(const Impl&); // disabled
 };
 
-#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __APPLE__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
 
 struct FileLock::Impl
 {
@@ -457,7 +457,7 @@ cv::String getCacheDirectory(const char* sub_directory_name, const char* configu
             default_cache_path = "/tmp/";
             CV_LOG_WARNING(NULL, "Using world accessible cache directory. This may be not secure: " << default_cache_path);
         }
-#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__
+#elif defined __linux__ || defined __HAIKU__ || defined __FreeBSD__ || defined __GNU__
         // https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
         if (default_cache_path.empty())
         {
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index c6756f6502..0b4c010bea 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -2292,6 +2292,139 @@ INSTANTIATE_TEST_CASE_P(Arithm, FlipND, testing::Combine(
     testing::Values(perf::MatType(CV_8UC1), CV_32FC1)
 ));
 
+TEST(BroadcastTo, basic) { // exercises broadcast() on int32, uint8 and float32 sources
+    std::vector<int> shape_src{2, 1};
+    std::vector<int> data_src{1, 2};
+    Mat src(static_cast<int>(shape_src.size()), shape_src.data(), CV_32SC1, data_src.data());
+
+    auto get_index = [](const std::vector<int>& shape, size_t cnt) { // flat element counter -> per-dimension index, last dimension varying fastest
+        std::vector<int> index(shape.size());
+        size_t t = cnt;
+        for (int i = static_cast<int>(shape.size() - 1); i >= 0; --i) {
+            size_t idx = t / shape[i];
+            index[i] = static_cast<int>(t - idx * shape[i]); // remainder of t / shape[i]
+            t = idx;
+        }
+        return index;
+    };
+
+    auto fn_verify = [&get_index](const Mat& ref, const Mat& res) { // compares type, shape and every element of res against ref
+        // check type
+        EXPECT_EQ(ref.type(), res.type());
+        // check shape
+        EXPECT_EQ(ref.dims, res.dims);
+        for (int i = 0; i < ref.dims; ++i) {
+            EXPECT_EQ(ref.size[i], res.size[i]);
+        }
+        // check value: visit every element through its N-d index; ASSERT_EQ leaves this lambda at the first mismatch
+        std::vector<int> shape{ref.size.p, ref.size.p + ref.dims};
+        for (size_t i = 0; i < ref.total(); ++i) {
+            auto index = get_index(shape, i);
+            switch (ref.type()) {
+                case CV_32SC1: {
+                    ASSERT_EQ(ref.at<int>(index.data()), res.at<int>(index.data()));
+                } break;
+                case CV_8UC1: {
+                    ASSERT_EQ(ref.at<uint8_t>(index.data()), res.at<uint8_t>(index.data()));
+                } break;
+                case CV_32FC1: {
+                    ASSERT_EQ(ref.at<float>(index.data()), res.at<float>(index.data()));
+                } break;
+                default: FAIL() << "Unsupported type: " << ref.type();
+            }
+        }
+    };
+
+    { // int32 source: {2, 1} -> {4, 2, 3}
+        std::vector<int> shape{4, 2, 3};
+        std::vector<int> data_ref{
+            1, 1, 1, // [0, 0, :]
+            2, 2, 2, // [0, 1, :]
+            1, 1, 1, // [1, 0, :]
+            2, 2, 2, // [1, 1, :]
+            1, 1, 1, // [2, 0, :]
+            2, 2, 2, // [2, 1, :]
+            1, 1, 1, // [3, 0, :]
+            2, 2, 2  // [3, 1, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), src.type(), data_ref.data());
+        Mat dst;
+        broadcast(src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    { // uint8 source (src converted with convertTo): {2, 1} -> {4, 2, 3}
+        Mat _src;
+        src.convertTo(_src, CV_8U);
+        std::vector<int> shape{4, 2, 3};
+        std::vector<uint8_t> data_ref{
+            1, 1, 1, // [0, 0, :]
+            2, 2, 2, // [0, 1, :]
+            1, 1, 1, // [1, 0, :]
+            2, 2, 2, // [1, 1, :]
+            1, 1, 1, // [2, 0, :]
+            2, 2, 2, // [2, 1, :]
+            1, 1, 1, // [3, 0, :]
+            2, 2, 2  // [3, 1, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    { // float32 source: {2, 1} -> {1, 1, 2, 1}, only leading unit dimensions are added
+        Mat _src;
+        src.convertTo(_src, CV_32F);
+        std::vector<int> shape{1, 1, 2, 1}; // {2, 1}
+        std::vector<float> data_ref{
+            1.f, // [0, 0, 0, 0]
+            2.f, // [0, 0, 1, 0]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+
+    { // float32 source: {2, 3, 4} -> {2, 1, 2, 3, 4}, the source data appears once per leading index
+        std::vector<int> _shape_src{2, 3, 4};
+        std::vector<float> _data_src{
+            1.f, 2.f, 3.f, 4.f, // [0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [1, 2, :]
+        };
+        Mat _src(static_cast<int>(_shape_src.size()), _shape_src.data(), CV_32FC1, _data_src.data());
+
+        std::vector<int> shape{2, 1, 2, 3, 4};
+        std::vector<float> data_ref{
+            1.f, 2.f, 3.f, 4.f, // [0, 0, 0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [0, 0, 0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [0, 0, 0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [0, 0, 1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [0, 0, 1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [0, 0, 1, 2, :]
+
+            1.f, 2.f, 3.f, 4.f, // [1, 0, 0, 0, :]
+            2.f, 3.f, 4.f, 5.f, // [1, 0, 0, 1, :]
+            3.f, 4.f, 5.f, 6.f, // [1, 0, 0, 2, :]
+
+            4.f, 5.f, 6.f, 7.f, // [1, 0, 1, 0, :]
+            5.f, 6.f, 7.f, 8.f, // [1, 0, 1, 1, :]
+            6.f, 7.f, 8.f, 9.f, // [1, 0, 1, 2, :]
+        };
+        Mat ref(static_cast<int>(shape.size()), shape.data(), _src.type(), data_ref.data());
+        Mat dst;
+        broadcast(_src, shape, dst);
+        fn_verify(ref, dst);
+    }
+}
+
 TEST(Core_minMaxIdx, regression_9207_2)
 {
     const int rows = 13;
diff --git a/modules/core/test/test_countnonzero.cpp b/modules/core/test/test_countnonzero.cpp
index fe14affb9c..41eaceb189 100644
--- a/modules/core/test/test_countnonzero.cpp
+++ b/modules/core/test/test_countnonzero.cpp
@@ -259,7 +259,7 @@ TEST_P (CountNonZeroND, ndim)
     const int ONE_SIZE = 5;
 
     vector<int> sizes(dims);
-    fill(sizes.begin(), sizes.end(), ONE_SIZE);
+    std::fill(sizes.begin(), sizes.end(), ONE_SIZE);
 
     Mat data(sizes, CV_MAKETYPE(type, 1));
     data = 0;
diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp
index 481e6bb1f2..1ece6de82f 100644
--- a/modules/core/test/test_intrin_utils.hpp
+++ b/modules/core/test/test_intrin_utils.hpp
@@ -1475,12 +1475,15 @@ template<typename R> struct TheTest
     TheTest & test_float_math()
     {
         typedef typename V_RegTraits<R>::round_reg Ri;
-        Data<R> data1, data2, data3;
+        Data<R> data1, data1_border, data2, data3;
+        // See https://github.com/opencv/opencv/issues/24213
+        data1_border *= 0.5;
         data1 *= 1.1;
         data2 += 10;
-        R a1 = data1, a2 = data2, a3 = data3;
+        R a1 = data1, a1_border = data1_border, a2 = data2, a3 = data3;
 
         Data<Ri> resB = v_round(a1),
+                 resB_border = v_round(a1_border),
                  resC = v_trunc(a1),
                  resD = v_floor(a1),
                  resE = v_ceil(a1);
@@ -1493,6 +1496,7 @@ template<typename R> struct TheTest
         {
             SCOPED_TRACE(cv::format("i=%d", i));
             EXPECT_EQ(cvRound(data1[i]), resB[i]);
+            EXPECT_EQ(cvRound(data1_border[i]), resB_border[i]);
             EXPECT_EQ((typename VTraits<Ri>::lane_type)data1[i], resC[i]);
             EXPECT_EQ(cvFloor(data1[i]), resD[i]);
             EXPECT_EQ(cvCeil(data1[i]), resE[i]);
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 804b78ead2..774e3c7b5a 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -58,11 +58,6 @@ endif()
 ocv_cmake_hook_append(INIT_MODULE_SOURCES_opencv_dnn "${CMAKE_CURRENT_LIST_DIR}/cmake/hooks/INIT_MODULE_SOURCES_opencv_dnn.cmake")
 
 
-if(HAVE_TENGINE)
-  ocv_target_compile_definitions(${the_module} PRIVATE "HAVE_TENGINE=1")
-endif()
-
-
 if(MSVC)
   add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
   ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
@@ -172,11 +167,6 @@ else()
   set(sources_options ${sources_options} EXCLUDE_CUDA)
 endif()
 
-if(HAVE_TENGINE)
-	list(APPEND include_dirs ${TENGINE_INCLUDE_DIRS})
-	list(APPEND libs -Wl,--whole-archive ${TENGINE_LIBRARIES} -Wl,--no-whole-archive)
-endif()
-
 if(HAVE_TIMVX)
     list(APPEND include_dirs ${TIMVX_INCLUDE_DIR})
     list(APPEND libs -Wl,--whole-archive ${TIMVX_LIBRARY} -Wl,--no-whole-archive)
@@ -237,6 +227,10 @@ if(TARGET ocv.3rdparty.openvino AND OPENCV_DNN_OPENVINO)
   endif()
 endif()
 
+set(OPENCV_DNN_BACKEND_DEFAULT "" CACHE STRING "Default backend used by the DNN module (DNN_BACKEND_OPENCV if empty)")
+if(OPENCV_DNN_BACKEND_DEFAULT)
+  ocv_append_source_file_compile_definitions("${CMAKE_CURRENT_LIST_DIR}/src/dnn_params.cpp" "OPENCV_DNN_BACKEND_DEFAULT=${OPENCV_DNN_BACKEND_DEFAULT}")
+endif()
 
 ocv_install_used_external_targets(${libs} ${dnn_runtime_libs})
 
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index d61f7191bc..2bd3f790b4 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -69,9 +69,7 @@ CV__DNN_INLINE_NS_BEGIN
      */
     enum Backend
     {
-        //! DNN_BACKEND_DEFAULT equals to DNN_BACKEND_INFERENCE_ENGINE if
-        //! OpenCV is built with Intel OpenVINO or
-        //! DNN_BACKEND_OPENCV otherwise.
+        //! DNN_BACKEND_DEFAULT resolves to the value of OPENCV_DNN_BACKEND_DEFAULT, which can be set through CMake at build time or as a configuration parameter (DNN_BACKEND_OPENCV if neither is set)
         DNN_BACKEND_DEFAULT = 0,
         DNN_BACKEND_HALIDE,
         DNN_BACKEND_INFERENCE_ENGINE,            //!< Intel OpenVINO computational backend
@@ -688,9 +686,6 @@ CV__DNN_INLINE_NS_BEGIN
          * @brief Ask network to use specific computation backend where it supported.
          * @param[in] backendId backend identifier.
          * @see Backend
-         *
-         * If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT
-         * means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals to DNN_BACKEND_OPENCV.
          */
         CV_WRAP void setPreferableBackend(int backendId);
 
diff --git a/modules/dnn/misc/python/test/test_dnn.py b/modules/dnn/misc/python/test/test_dnn.py
index 5c91aae56f..a06c02ad2d 100644
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@@ -191,10 +191,10 @@ class dnn_test(NewOpenCVTests):
 
     def test_model(self):
         img_path = self.find_dnn_file("dnn/street.png")
-        weights = self.find_dnn_file("dnn/MobileNetSSD_deploy.caffemodel", required=False)
-        config = self.find_dnn_file("dnn/MobileNetSSD_deploy.prototxt", required=False)
+        weights = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", required=False)
+        config = self.find_dnn_file("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", required=False)
         if weights is None or config is None:
-            raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+            raise unittest.SkipTest("Missing DNN test files (dnn/MobileNetSSD_deploy_19e3ec3.{prototxt/caffemodel}). Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
 
         frame = cv.imread(img_path)
         model = cv.dnn_DetectionModel(weights, config)
diff --git a/modules/dnn/perf/perf_caffe.cpp b/modules/dnn/perf/perf_caffe.cpp
index 370f06dba2..f1ba26afcc 100644
--- a/modules/dnn/perf/perf_caffe.cpp
+++ b/modules/dnn/perf/perf_caffe.cpp
@@ -101,8 +101,8 @@ PERF_TEST(SqueezeNet_v1_1_caffe, CaffePerfTest)
 
 PERF_TEST(MobileNet_SSD, CaffePerfTest)
 {
-    caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy.prototxt",
-                                     "dnn/MobileNetSSD_deploy.caffemodel");
+    caffe::Net<float>* net = initNet("dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
+                                     "dnn/MobileNetSSD_deploy_19e3ec3.caffemodel");
     TEST_CYCLE() net->Forward();
     SANITY_CHECK_NOTHING();
 }
diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp
index 261bc5c3ca..3020dbea66 100644
--- a/modules/dnn/perf/perf_layer.cpp
+++ b/modules/dnn/perf/perf_layer.cpp
@@ -678,7 +678,6 @@ PERF_TEST_P_(Layer_FullyConnected, fc)
     lp.set("axis", input.dims - 1);
     lp.set("is_matmul", weights.dims > 2);
     lp.set("bias_term", false);
-    lp.set("transB", true);
     lp.set("num_output", (int)weights.total(0, weights.dims - 1));
     lp.blobs.resize(1, weights);
 
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index cfbb45b173..7f852e8f7b 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -141,7 +141,7 @@ PERF_TEST_P_(DNNTestNetwork, MobileNet_SSD_Caffe)
 {
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
-    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt", "",
+    processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt", "",
             Mat(cv::Size(300, 300), CV_32FC3));
 }
 
diff --git a/modules/dnn/src/dnn_params.cpp b/modules/dnn/src/dnn_params.cpp
index 86a43db757..a76f4cd512 100644
--- a/modules/dnn/src/dnn_params.cpp
+++ b/modules/dnn/src/dnn_params.cpp
@@ -36,7 +36,11 @@ bool getParam_DNN_OPENCL_ALLOW_ALL_DEVICES()
 int getParam_DNN_BACKEND_DEFAULT()
 {
     static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
+#ifdef OPENCV_DNN_BACKEND_DEFAULT // fallback value supplied at build time as a compile definition for this file
+            (size_t)OPENCV_DNN_BACKEND_DEFAULT
+#else // no build-time override: fall back to DNN_BACKEND_OPENCV
             (size_t)DNN_BACKEND_OPENCV
+#endif // OPENCV_DNN_BACKEND_DEFAULT
     );
     return PARAM_DNN_BACKEND_DEFAULT;
 }
diff --git a/modules/dnn/src/dnn_utils.cpp b/modules/dnn/src/dnn_utils.cpp
index 18c7e975eb..d4d7dda008 100644
--- a/modules/dnn/src/dnn_utils.cpp
+++ b/modules/dnn/src/dnn_utils.cpp
@@ -5,6 +5,7 @@
 #include "precomp.hpp"
 
 #include <opencv2/imgproc.hpp>
+#include <opencv2/core/utils/logger.hpp>
 
 
 namespace cv {
@@ -100,15 +101,29 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
     images_.getMatVector(images);
     CV_Assert(!images.empty());
 
-    int nch = images[0].channels();
-    Scalar scalefactor = param.scalefactor;
-
     if (param.ddepth == CV_8U)
     {
-        CV_Assert(scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
+        CV_Assert(param.scalefactor == Scalar::all(1.0) && "Scaling is not supported for CV_8U blob depth");
         CV_Assert(param.mean == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
     }
 
+    int nch = images[0].channels();
+    Scalar scalefactor = param.scalefactor;
+    Scalar mean = param.mean;
+
+    if (param.swapRB)
+    {
+        if (nch > 2)
+        {
+            std::swap(mean[0], mean[2]);
+            std::swap(scalefactor[0], scalefactor[2]);
+        }
+        else
+        {
+            CV_LOG_WARNING(NULL, "Red/blue color swapping requires at least three image channels.");
+        }
+    }
+
     for (size_t i = 0; i < images.size(); i++)
     {
         Size imgSize = images[i].size();
@@ -126,34 +141,26 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
                           size);
                 images[i] = images[i](crop);
             }
+            else if (param.paddingmode == DNN_PMODE_LETTERBOX)
+            {
+                float resizeFactor = std::min(size.width / (float)imgSize.width,
+                                              size.height / (float)imgSize.height);
+                int rh = int(imgSize.height * resizeFactor);
+                int rw = int(imgSize.width * resizeFactor);
+                resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
+
+                int top = (size.height - rh)/2;
+                int bottom = size.height - top - rh;
+                int left = (size.width - rw)/2;
+                int right = size.width - left - rw;
+                copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
+            }
             else
             {
-                if (param.paddingmode == DNN_PMODE_LETTERBOX)
-                {
-                    float resizeFactor = std::min(size.width / (float)imgSize.width,
-                                                  size.height / (float)imgSize.height);
-                    int rh = int(imgSize.height * resizeFactor);
-                    int rw = int(imgSize.width * resizeFactor);
-                    resize(images[i], images[i], Size(rw, rh), INTER_LINEAR);
-
-                    int top = (size.height - rh)/2;
-                    int bottom = size.height - top - rh;
-                    int left = (size.width - rw)/2;
-                    int right = size.width - left - rw;
-                    copyMakeBorder(images[i], images[i], top, bottom, left, right, BORDER_CONSTANT);
-                }
-                else
-                    resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
+                resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
             }
         }
 
-        Scalar mean = param.mean;
-        if (param.swapRB)
-        {
-            std::swap(mean[0], mean[2]);
-            std::swap(scalefactor[0], scalefactor[2]);
-        }
-
         if (images[i].depth() == CV_8U && param.ddepth == CV_32F)
             images[i].convertTo(images[i], CV_32F);
 
@@ -220,18 +227,22 @@ void blobFromImagesWithParams(InputArrayOfArrays images_, OutputArray blob_, con
             CV_Assert(image.depth() == blob_.depth());
             CV_Assert(image.channels() == image0.channels());
             CV_Assert(image.size() == image0.size());
-            if (param.swapRB)
+            if (nch > 2 && param.swapRB)
             {
                 Mat tmpRB;
                 cvtColor(image, tmpRB, COLOR_BGR2RGB);
                 tmpRB.copyTo(Mat(tmpRB.rows, tmpRB.cols, subMatType, blob.ptr((int)i, 0)));
             }
             else
+            {
                 image.copyTo(Mat(image.rows, image.cols, subMatType, blob.ptr((int)i, 0)));
+            }
         }
     }
     else
+    {
         CV_Error(Error::StsUnsupportedFormat, "Unsupported data layout in blobFromImagesWithParams function.");
+    }
 }
 
 void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index 140d4b0d2f..f9341febb5 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -383,11 +383,17 @@ public:
 
 #endif // OpenVINO >= 2022.1
 
-InfEngineNgraphNode::InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node)
-    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {}
+InfEngineNgraphNode::InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(std::move(_node)) {  // _node is moved into the `node` member
+    CV_Assert(node.get_node());  // the stored output must refer to a node
+    CV_Assert(node.get_node_shared_ptr());  // the shared_ptr form of that node must be non-null as well
+}
 
-InfEngineNgraphNode::InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node)
-    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {}
+InfEngineNgraphNode::InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node)
+    : BackendNode(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH), node(_node) {  // _node is copied into the `node` member
+    CV_Assert(node.get_node());  // the stored output must refer to a node
+    CV_Assert(node.get_node_shared_ptr());  // the shared_ptr form of that node must be non-null as well
+}
 
 InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& nodes,
                                          Ptr<Layer>& cvLayer_, std::vector<Mat*>& inputs,
@@ -420,7 +426,7 @@ InfEngineNgraphNode::InfEngineNgraphNode(const std::vector<Ptr<BackendNode> >& n
 }
 
 void InfEngineNgraphNode::setName(const std::string& name) {
-    node->set_friendly_name(name);
+    node.get_node()->set_friendly_name(name);  // the friendly name is set on the node behind the stored output
 }
 
 InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl)
@@ -441,8 +447,7 @@ InfEngineNgraphNet::InfEngineNgraphNet(detail::NetImplBase& netImpl, InferenceEn
 void InfEngineNgraphNet::addOutput(const Ptr<InfEngineNgraphNode>& node)
 {
     CV_Assert(node);
-    CV_Assert(node->node);
-    const std::string& name = node->node.get_node()->get_friendly_name();  // friendly name of the underlying node; it is the key inserted into requestedOutputs below
+    const std::string& name = node->node.get_node()->get_friendly_name();  // friendly name of the underlying node; it is the key inserted into requestedOutputs below
     requestedOutputs.insert({name, node.get()});
 }
 
@@ -458,7 +463,7 @@ void InfEngineNgraphNet::createNet(Target targetId) {
             CV_Assert(output_node_it->second);
             auto out = std::make_shared<ngraph::op::Result>(output_node_it->second->node);
 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
-            out->set_friendly_name(output_node_it->first + (output_node_it->second->node->get_output_size() == 1 ? "" : ".0"));
+            out->set_friendly_name(output_node_it->first + (output_node_it->second->node.get_node()->get_output_size() == 1 ? "" : ".0"));
 #endif
             outs.push_back(out);
         }
diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp
index 7bb0ac09df..cc8f53ca5c 100644
--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@@ -93,13 +93,13 @@ public:
                         std::vector<Mat*>& inputs, std::vector<Mat>& outputs,
                         std::vector<Mat>& internals);
 
-    InfEngineNgraphNode(std::shared_ptr<ngraph::Node>&& _node);
-    InfEngineNgraphNode(const std::shared_ptr<ngraph::Node>& _node);
+    InfEngineNgraphNode(ngraph::Output<ngraph::Node>&& _node);
+    InfEngineNgraphNode(const ngraph::Output<ngraph::Node>& _node);
 
     void setName(const std::string& name);
 
     // Inference Engine network object that allows to obtain the outputs of this layer.
-    std::shared_ptr<ngraph::Node> node;
+    ngraph::Output<ngraph::Node> node;
     Ptr<InfEngineNgraphNet> net;
     Ptr<dnn::Layer> cvLayer;
 };
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index b90ee934ef..1d95096e60 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -457,7 +457,7 @@ public:
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
         shape[1] = weights_.total();
         auto weight = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), weights_.data);
         auto bias = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), bias_.data);
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 3095e2d6c9..16de23b15e 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -148,7 +148,7 @@ public:
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
-        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+        auto ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         ngraph::OutputVector inp{ieInpNode};
         auto blank = std::make_shared<ngraph::op::Concat>(inp, 0);
         return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index 6bd3dcdea5..a5af16f32e 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -392,7 +392,7 @@ public:
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
-        const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node->get_shape().size();
+        const int numDims = nodes[0].dynamicCast<InfEngineNgraphNode>()->node.get_shape().size();
         const int cAxis = normalize_axis(axis, numDims);
         std::vector<size_t> maxDims(numDims, 0);
 
@@ -403,7 +403,7 @@ public:
             auto inp = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
             inp_nodes.push_back(inp);
 
-            std::vector<size_t> inpShape = inp->get_shape();
+            std::vector<size_t> inpShape = inp.get_shape();
             for (int i = 0; i < numDims; ++i)
                 maxDims[i] = std::max(maxDims[i], inpShape[i]);
         }
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 2787d64880..d6e0aba1c6 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -62,9 +62,6 @@
 #include "opencl_kernels_dnn.hpp"
 using namespace cv::dnn::ocl4dnn;
 #endif
-#ifdef HAVE_TENGINE
-#include "../tengine4dnn/include/tengine_graph_convolution.hpp"
-#endif
 
 #ifdef HAVE_CUDA
 #include "../cuda4dnn/primitives/convolution.hpp"
@@ -267,10 +264,6 @@ public:
     float power;
 #endif
 
-#ifdef HAVE_TENGINE
-    teng_graph_t tengine_graph;
-#endif
-
 #ifdef HAVE_CUDA
     cuda4dnn::ConvolutionConfiguration::FusionMode cudaFusionMode;
     cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
@@ -289,20 +282,8 @@ public:
 #ifdef HAVE_CUDA
         cudaFusionMode = cuda4dnn::ConvolutionConfiguration::FusionMode::NONE;
         cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
-#endif
-#ifdef HAVE_TENGINE
-        tengine_graph=NULL;
 #endif
     }
-#ifdef HAVE_TENGINE
-    ~ConvolutionLayerImpl()
-    {
-        if(NULL != tengine_graph )
-        {
-            tengine_release(tengine_graph);
-        }
-    }
-#endif
 
     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
     {
@@ -466,13 +447,6 @@ public:
             for(int i = 0; i < numOutput; i++ )
                 biasvec[i] = biasMat.at<float>(i);
         }
-#ifdef HAVE_TENGINE
-        if(NULL != tengine_graph )
-        {
-            tengine_release(tengine_graph);
-            tengine_graph = NULL ;
-        }
-#endif
 #ifdef HAVE_OPENCL
         convolutionOp.release();
 #endif
@@ -848,13 +822,13 @@ public:
         CV_Assert(!blobs.empty());
         CV_Assert_N(inputs.size() >= 1, nodes.size() >= 1);
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> dims = ieInpNode->get_shape();
+        std::vector<size_t> dims = ieInpNode.get_shape();
         CV_Check(dims.size(), dims.size() >= 3 && dims.size() <= 5, "");
-        std::shared_ptr<ngraph::Node> ieWeights = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
+        ngraph::Output<ngraph::Node> ieWeights;
         if (nodes.size() > 1)
-            CV_Assert(ieWeights);  // dynamic_cast should not fail
+            ieWeights = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
         const int inpCn = dims[1];
-        const int inpGroupCn = nodes.size() > 1 ? ieWeights->get_shape()[1] : blobs[0].size[1];
+        const int inpGroupCn = nodes.size() > 1 ? ieWeights.get_shape()[1] : blobs[0].size[1];
         const int group = inpCn / inpGroupCn;
 
         std::vector<size_t> kernel_shape;
@@ -1095,7 +1069,7 @@ public:
             config.pads = pads;
             config.stride = stride;
             config.dilation = dilation;
-            if (inputs[0].dims != 4 && inputs[0].dims != umat_blobs[0].dims)
+            if (inputs[0].dims != 4 && inputs[0].dims != (blobs.empty() ? umat_blobs[0].dims : blobs[0].dims))
             {
                 static bool bypassCheck = utils::getConfigurationParameterBool("OPENCV_OCL4DNN_CONVOLUTION_IGNORE_INPUT_DIMS_4_CHECK", false);
                 if (!bypassCheck)
@@ -1107,7 +1081,7 @@ public:
                     return false;
                 }
             }
-            config.group = inputs[0].size[1] / umat_blobs[0].size[1];
+            config.group = inputs[0].size[1] / (blobs.empty() ? umat_blobs[0].size[1] : blobs[0].size[1]);
             if (config.group < 1)  // config.group == 0 causes div by zero in ocl4dnn code
             {
                 CV_LOG_WARNING(NULL, "DNN/OpenCL: Unsupported config.group=" << config.group
@@ -1305,65 +1279,6 @@ public:
             }
         }
 
-#ifdef HAVE_TENGINE
-        bool tengine_ret = false;
-
-        std::vector<Mat> teng_in, teng_out;
-        inputs_arr.getMatVector(teng_in);
-        outputs_arr.getMatVector(teng_out);
-
-        int inch = teng_in[0].size[1];    // inch
-        int in_h = teng_in[0].size[2];    // in_h
-        int in_w = teng_in[0].size[3];    // in_w
-
-        int out_b = teng_out[0].size[0];  // out batch size
-        int outch = teng_out[0].size[1];  // outch
-        int out_h = teng_out[0].size[2];  // out_h
-        int out_w = teng_out[0].size[3];  // out_w
-
-        float *input_  = teng_in[0].ptr<float>();
-        float *output_ = teng_out[0].ptr<float>();
-        float *kernel_ = weightsMat.ptr<float>();
-        float *teg_bias = &biasvec[0];
-
-        int nstripes = std::max(getNumThreads(), 1);
-
-        /* tengine_init will run when first time. */
-        if(NULL == tengine_graph)
-        {
-            // pads_begin: 0 - pad_top,    1 - pad_left
-            // pads_end:   0 - pad_bottom, 1 - pad_right
-            // pad_h0: pad_top,  pad_h1: pad_bottom
-            // pad_w0: pad_left, pad_w1: pad_right
-            tengine_graph = tengine_init(name.c_str(), input_, inch, ngroups, in_h, in_w,
-                                         output_, out_b, outch, out_h, out_w,
-                                         kernel_, kernel_size.size(), kernel.height, kernel.width,
-                                         teg_bias, stride.height, stride.width,
-                                         pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
-                                         weightsMat.step1(), padMode, tengine_graph, nstripes);
-            // printf("Init(%s):  input=%p(%d %d %d %d ),output=%p(%d %d %d %d ),kernel=%p(%ld %d %d ), bias=%p ,"
-            //        "stride(%d %d), pad(%d %d %d %d), dilation(%d %d) ,weightsMat=%ld, padMode=%s ,tengine_graph = %p \n",
-            //        name.c_str(),input_, inch, ngroups, in_h, in_w,
-            //        output_, out_b, outch, out_h, out_w,
-            //        kernel_, kernel_size.size(), kernel.height, kernel.width,
-            //        teg_bias, stride.height, stride.width,
-            //        pads_begin[0], pads_end[0], pads_begin[1], pads_end[1], dilation.height, dilation.width,
-            //        weightsMat.step1(), padMode.c_str() ,tengine_graph);
-        }
-        if(NULL != tengine_graph)
-        {
-            tengine_ret = tengine_forward(tengine_graph);
-        }
-        /* activation */
-        if((true == tengine_ret) && activ )
-        {
-            int out_cstep = out_h * out_w;	    // out_cstep
-
-            ActivationLayer* activ_ = activ.get();
-            activ_->forwardSlice(output_, output_, out_cstep, out_cstep, 0, outch);
-        }
-        if(false == tengine_ret)
-#endif
         {
             int nstripes = std::max(getNumThreads(), 1);
             int conv_dim = CONV_2D;
diff --git a/modules/dnn/src/layers/cpu_kernels/convolution.hpp b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
index 22ef9a8575..5effdc2d0c 100644
--- a/modules/dnn/src/layers/cpu_kernels/convolution.hpp
+++ b/modules/dnn/src/layers/cpu_kernels/convolution.hpp
@@ -14,7 +14,7 @@
 #define CONV_NR_FP32 28
 
 // The FP16 can only be supported by ARM64 and with FP16 FMA supported.
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // check FP16 FMA.
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && CV_FP16 // check FP16 FMA.
 #define CONV_ARM_FP16 1
 #endif
 
diff --git a/modules/dnn/src/layers/crop_and_resize_layer.cpp b/modules/dnn/src/layers/crop_and_resize_layer.cpp
index eb8822870f..a6f58f8983 100644
--- a/modules/dnn/src/layers/crop_and_resize_layer.cpp
+++ b/modules/dnn/src/layers/crop_and_resize_layer.cpp
@@ -133,7 +133,7 @@ public:
         auto input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto rois = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
-        auto rois_shape = rois->get_shape();
+        auto rois_shape = rois.get_shape();
         std::vector<int64_t> dims(rois_shape.begin(), rois_shape.end()), offsets(4, 0);
         offsets[3] = 2;
         dims[3] = 7;
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index 3bcd53f95c..4247511879 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -490,7 +490,7 @@ struct ReLUFunctor : public BaseFunctor
 #endif
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         if (slope) {
             auto param = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &slope);
@@ -674,7 +674,7 @@ struct ReLU6Functor : public BaseFunctor
 
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         return std::make_shared<ngraph::op::Clamp>(node, minValue, maxValue);
     }
@@ -796,7 +796,7 @@ struct BaseDefaultFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         CV_Error(Error::StsNotImplemented, "");
     }
@@ -929,7 +929,7 @@ struct TanHFunctor : public BaseDefaultFunctor<TanHFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         return std::make_shared<ngraph::op::Tanh>(node);
     }
@@ -998,7 +998,7 @@ struct SwishFunctor : public BaseDefaultFunctor<SwishFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         auto sigmoid = std::make_shared<ngraph::op::Sigmoid>(node);
         return std::make_shared<ngraph::op::v1::Multiply>(node, sigmoid);
@@ -1074,7 +1074,7 @@ struct MishFunctor : public BaseDefaultFunctor<MishFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         float one = 1.0f;
         auto constant = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &one);
@@ -1157,7 +1157,7 @@ struct SigmoidFunctor : public BaseDefaultFunctor<SigmoidFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         return std::make_shared<ngraph::op::Sigmoid>(node);
     }
@@ -1237,7 +1237,7 @@ struct ELUFunctor : public BaseDefaultFunctor<ELUFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         return std::make_shared<ngraph::op::Elu>(node, alpha);
     }
@@ -1307,7 +1307,7 @@ struct AbsValFunctor : public BaseDefaultFunctor<AbsValFunctor>
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         float coeff = -0.999999f;
         // float coeff = preferableTarget == DNN_TARGET_MYRIAD ? -0.999f : -0.999999f;
@@ -1603,7 +1603,7 @@ struct SqrtFunctor : public BaseDefaultFunctor<SqrtFunctor>
 #endif  // HAVE_HALIDE
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         return std::make_shared<ngraph::op::v0::Sqrt>(node);
     }
@@ -2329,7 +2329,7 @@ struct PowerFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                                  ngraph::Shape{1}, &scale);
@@ -2439,7 +2439,7 @@ struct ExpFunctor : public BaseDefaultFunctor<ExpFunctor>
 #endif  // HAVE_HALIDE
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         auto scale_node = std::make_shared<ngraph::op::Constant>(ngraph::element::f32,
                                                                  ngraph::Shape{1}, &normScale);
@@ -2598,7 +2598,7 @@ struct ChannelsPReLUFunctor : public BaseFunctor
 #endif // HAVE_CANN
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         const size_t numChannels = scale.total();
         auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{numChannels}, scale.data);
@@ -2678,7 +2678,7 @@ struct PReLUFunctor : public ChannelsPReLUFunctor
     }
 
 #ifdef HAVE_DNN_NGRAPH
-    std::shared_ptr<ngraph::Node> initNgraphAPI(const std::shared_ptr<ngraph::Node>& node)
+    std::shared_ptr<ngraph::Node> initNgraphAPI(const ngraph::Output<ngraph::Node>& node)
     {
         auto shape = getShape<size_t>(scale);
         auto slope = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, shape, scale.ptr<float>());
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 8ed1b799eb..49b3c02de3 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -896,12 +896,14 @@ public:
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
+        CV_Assert(nodes.size() >= 2);
         auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         if (!coeffs.empty()) {
             auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
             curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
         }
 
+        std::shared_ptr<ngraph::Node> res;
         for (size_t i = 1; i < nodes.size(); i++)
         {
             auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
@@ -910,15 +912,16 @@ public:
                 next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
             }
             switch (op) {
-                case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
-                case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
-                case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
-                case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
-                case MIN:  curr_node = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
+                case SUM:  res = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
+                case PROD: res = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
+                case DIV:  res = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
+                case MAX:  res = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
+                case MIN:  res = std::make_shared<ngraph::op::v1::Minimum>(curr_node, next_node); break;
                 default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
             }
+            curr_node = res;
         }
-        return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
+        return Ptr<BackendNode>(new InfEngineNgraphNode(res));
     }
 #endif  // HAVE_DNN_NGRAPH
 
diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp
index 6a502af7e9..9ff3bec38b 100644
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@@ -209,7 +209,7 @@ public:
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        std::vector<size_t> dims = ieInpNode->get_shape();
+        std::vector<size_t> dims = ieInpNode.get_shape();
 
         int numAxes = dims.size();
         int startAxis = normalize_axis(_startAxis, numAxes);
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 9cdb31023c..f03af7c1fb 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -803,7 +803,7 @@ public:
         }
         else
         {
-            std::vector<int> shape(1 + normalize_axis(axis, ieInpNode->get_shape().size()), 0);
+            std::vector<int> shape(1 + normalize_axis(axis, ieInpNode.get_shape().size()), 0);
             shape[shape.size() - 1] = -1;
             auto inp = std::make_shared<ngraph::op::v1::Reshape>(
                 ieInpNode,
diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp
index 61c2224e36..f8de64cb32 100644
--- a/modules/dnn/src/layers/lrn_layer.cpp
+++ b/modules/dnn/src/layers/lrn_layer.cpp
@@ -480,7 +480,7 @@ public:
         if (type != SPATIAL_NRM) {
             axes = {1};
         } else {
-            axes.resize(ieInpNode->get_shape().size() - 2);
+            axes.resize(ieInpNode.get_shape().size() - 2);
             std::iota(axes.begin(), axes.end(), 2);
         }
         auto ngraph_axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes.size()}, axes.data());
diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp
index 6a599408e1..7ed6c64ae8 100644
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@@ -194,7 +194,7 @@ public:
         std::vector<MatShape> inpShapes(nodes.size());
         std::vector<MatShape> outShapes, internals;
         for (int i = 0; i < nodes.size(); ++i) {
-            std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
+            std::vector<size_t> shape = nodes[i].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
             inpShapes[i] = std::vector<int>(shape.begin(), shape.end());
         }
         getMemoryShapes(inpShapes, 1, outShapes, internals);
@@ -213,7 +213,7 @@ public:
             std::make_shared<ngraph::op::Constant>(ngraph::element::i32, ngraph::Shape{1}, &newShape),
             true
         );
-        if (indices->get_element_type() != ngraph::element::i32 && indices->get_element_type() != ngraph::element::i64) {
+        if (indices.get_element_type() != ngraph::element::i32 && indices.get_element_type() != ngraph::element::i64) {
             indices = std::make_shared<ngraph::op::Convert>(indices, ngraph::element::i64);
         }
 
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index dc23656b7a..aae53fa327 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -390,7 +390,7 @@ public:
         auto mvn = std::make_shared<ngraph::op::MVN>(ieInpNode, acrossChannels, normVariance, eps);
 #else
         int64_t start_axis = acrossChannels ? 1 : 2;
-        std::vector<int64_t> axes_v(ieInpNode->get_shape().size() - start_axis);
+        std::vector<int64_t> axes_v(ieInpNode.get_shape().size() - start_axis);
         std::iota(axes_v.begin(), axes_v.end(), start_axis);
         auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_v.size()}, axes_v.data());
         auto mvn = std::make_shared<ngraph::op::v6::MVN>(ieInpNode, axes, normVariance, eps, ngraph::op::MVNEpsMode::INSIDE_SQRT);
diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp
index fadbf58244..8572eee995 100644
--- a/modules/dnn/src/layers/nary_eltwise_layers.cpp
+++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp
@@ -900,12 +900,12 @@ public:
         auto& inp0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
         auto& inp1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
-        if (inp0->get_element_type() != inp1->get_element_type()) {
+        if (inp0.get_element_type() != inp1.get_element_type()) {
             auto dtype = preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD ?
                         ngraph::element::f16 : ngraph::element::f32;
-            if (inp0->get_element_type() != dtype)
+            if (inp0.get_element_type() != dtype)
                 inp0 = std::make_shared<ngraph::op::v0::Convert>(inp0, dtype);
-            if (inp1->get_element_type() != dtype)
+            if (inp1.get_element_type() != dtype)
                 inp1 = std::make_shared<ngraph::op::v0::Convert>(inp1, dtype);
         }
 
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index f0ad6e6f61..431eeab82d 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -273,21 +273,21 @@ public:
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        const size_t batch = ieInpNode->get_shape()[0];
-        const size_t numChannels = ieInpNode->get_shape()[1];
+        const size_t batch = ieInpNode.get_shape()[0];
+        const size_t numChannels = ieInpNode.get_shape()[1];
 
         std::vector<int64_t> axes_data;
         if (!acrossSpatial) {
             axes_data.push_back(1);
         } else {
-            axes_data.resize(ieInpNode->get_shape().size() - 1);
+            axes_data.resize(ieInpNode.get_shape().size() - 1);
             std::iota(axes_data.begin(), axes_data.end(), 1);
         }
         auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
         auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
 
         CV_Assert(blobs.empty() || numChannels == blobs[0].total());
-        std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode.get_shape().size(), 1);
         shape[0] = blobs.empty() ? 1 : batch;
         shape[1] = numChannels;
         if (!blobs.empty())
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 5caaa36ba0..a75382d8a5 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -209,7 +209,8 @@ public:
 #ifdef HAVE_INF_ENGINE
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
         {
-            return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin());
+            return type != STOCHASTIC && kernel_size.size() > 1 && (kernel_size.size() != 3 || !isArmComputePlugin()) &&
+                   (!computeMaxIdx || INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1));
         }
 #endif
         if (backendId == DNN_BACKEND_OPENCV)
@@ -600,7 +601,7 @@ public:
             return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
         }
         else if (type == SUM) {
-            ngraph::Shape inpShape = ieInpNode->get_shape();
+            ngraph::Shape inpShape = ieInpNode.get_shape();
             CV_Assert(inpShape.size() == 2 + kernel_size.size());
             std::vector<int64_t> axes;
             for (size_t i = 0; i < kernel_size.size(); i++)
@@ -615,10 +616,14 @@ public:
         else if (type == MAX) {
             std::shared_ptr<ngraph::Node> max_pool;
             if (computeMaxIdx) {
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
                 std::vector<size_t> dilations(kernel_size.size(), 1);
                 max_pool = std::make_shared<ngraph::op::v8::MaxPool>(ieInpNode, ngraph::Strides(strides), ngraph::Strides(dilations),
                                 ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
                                 rounding_type, pad_type);
+#else
+                CV_Error(Error::StsNotImplemented, "OpenVINO MaxPool with indices");
+#endif
             } else {
                 max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
                                 ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
diff --git a/modules/dnn/src/layers/proposal_layer.cpp b/modules/dnn/src/layers/proposal_layer.cpp
index e9edcf1547..2f2a33cc6f 100644
--- a/modules/dnn/src/layers/proposal_layer.cpp
+++ b/modules/dnn/src/layers/proposal_layer.cpp
@@ -366,10 +366,10 @@ public:
         auto& class_logits = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
         auto& image_shape  = nodes[2].dynamicCast<InfEngineNgraphNode>()->node;
 
-        CV_Assert_N(image_shape->get_shape().size() == 2, image_shape->get_shape().front() == 1);
+        CV_Assert_N(image_shape.get_shape().size() == 2, image_shape.get_shape().front() == 1);
         auto shape   = std::make_shared<ngraph::op::Constant>(ngraph::element::i64,
                        ngraph::Shape{1},
-                       std::vector<int64_t>{(int64_t)image_shape->get_shape().back()});
+                       std::vector<int64_t>{(int64_t)image_shape.get_shape().back()});
         auto reshape = std::make_shared<ngraph::op::v1::Reshape>(image_shape, shape, true);
 
         auto proposal = std::make_shared<ngraph::op::Proposal>(class_probs, class_logits, reshape, attr);
diff --git a/modules/dnn/src/layers/region_layer.cpp b/modules/dnn/src/layers/region_layer.cpp
index 7ab8cdd93f..49952b4c83 100644
--- a/modules/dnn/src/layers/region_layer.cpp
+++ b/modules/dnn/src/layers/region_layer.cpp
@@ -466,7 +466,7 @@ public:
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& input = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto parent_shape = input->get_shape();
+        auto parent_shape = input.get_shape();
         int64_t b = parent_shape[0];
         int64_t h = parent_shape[1];
         int64_t w = parent_shape[2];
@@ -567,7 +567,7 @@ public:
             int hNorm, wNorm;
             if (nodes.size() > 1)
             {
-                auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node->get_shape();
+                auto node_1_shape = nodes[1].dynamicCast<InfEngineNgraphNode>()->node.get_shape();
                 hNorm = node_1_shape[2];
                 wNorm = node_1_shape[3];
             }
diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp
index 607adb8aa1..fe27748319 100644
--- a/modules/dnn/src/layers/resize_layer.cpp
+++ b/modules/dnn/src/layers/resize_layer.cpp
@@ -443,7 +443,7 @@ public:
         std::vector<int64_t> shape = {outHeight, outWidth};
         auto out_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{2}, shape.data());
 
-        auto& input_shape = ieInpNode->get_shape();
+        auto& input_shape = ieInpNode.get_shape();
         CV_Assert_N(input_shape[2] != 0, input_shape[3] != 0);
         std::vector<float> scales = {static_cast<float>(outHeight) / input_shape[2], static_cast<float>(outWidth) / input_shape[3]};
         auto scales_shape = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{2}, scales.data());
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index 5338ab2215..2a4e1a05d5 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -331,34 +331,36 @@ public:
     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        auto ieInpNode1 = nodes.size() > 1 ? nodes[1].dynamicCast<InfEngineNgraphNode>()->node : nullptr;
+        ngraph::Output<ngraph::Node> ieInpNode1;
+        if (nodes.size() > 1)
+            ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
 
         size_t numChannels = 1;
         if (blobs.empty())
-            for (const size_t& dim : ieInpNode1->get_shape())
+            for (const size_t& dim : ieInpNode1.get_shape())
                 numChannels *= dim;
         else
             numChannels = blobs[0].total();
 
-        std::vector<size_t> shape(ieInpNode0->get_shape().size(), 1);
+        std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
         int cAxis = normalize_axis(axis, shape.size());
         shape[cAxis] = numChannels;
 
-        auto node = ieInpNode0;
+        std::shared_ptr<ngraph::Node> node;
         if (hasWeights)
         {
-            auto weight = blobs.empty() ? ieInpNode1 :
+            ngraph::Output<ngraph::Node> weight = blobs.empty() ? ieInpNode1 :
                           std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape(shape), blobs[0].data);
 
 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2021_2)
-            node = std::make_shared<ngraph::op::v1::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
+            node = std::make_shared<ngraph::op::v1::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
 #else
-            node = std::make_shared<ngraph::op::v0::Multiply>(node, weight, ngraph::op::AutoBroadcastType::NUMPY);
+            node = std::make_shared<ngraph::op::v0::Multiply>(ieInpNode0, weight, ngraph::op::AutoBroadcastType::NUMPY);
 #endif
         }
         if (hasBias || !hasWeights)
         {
-            std::shared_ptr<ngraph::Node> bias;
+            ngraph::Output<ngraph::Node> bias;
             if (hasBias)
             {
                 bias = blobs.empty() ? ieInpNode1 :
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index d3675e23a5..c44d18182e 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -759,7 +759,7 @@ public:
     {
         CV_Assert_N(nodes.size() <= 2);
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        CV_Assert(finalSliceRanges[0].size() == ieInpNode->get_shape().size());
+        CV_Assert(finalSliceRanges[0].size() == ieInpNode.get_shape().size());
 
         std::vector<int64_t> offsets, dims;
         for (int i = 0; i < finalSliceRanges[0].size(); ++i)
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index b74f2b6791..faab6a565f 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -385,7 +385,7 @@ public:
                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
     {
         auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
-        int axis = normalize_axis(axisRaw, ieInpNode->get_shape().size());
+        int axis = normalize_axis(axisRaw, ieInpNode.get_shape().size());
         auto softmax = std::make_shared<ngraph::op::v1::Softmax>(ieInpNode, axis);
         if (logSoftMax)
             return Ptr<BackendNode>(new InfEngineNgraphNode(std::make_shared<ngraph::op::v0::Log>(softmax)));
diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp
index 4570d2b360..dfa542bd41 100644
--- a/modules/dnn/src/net_impl_fuse.cpp
+++ b/modules/dnn/src/net_impl_fuse.cpp
@@ -210,7 +210,7 @@ void Net::Impl::fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
                 if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) != "add")
                 {
                     CV_LOG_DEBUG(NULL, "DNN/CPU: fusion with NaryEltwise or Eltwise Layer operation is not supported: "
-                        << nextData->params.get<String>("operation"));
+                        << toLowerCase(nextData->params.get<String>("operation", "sum")));
                     break;
                 }
 
diff --git a/modules/dnn/src/net_openvino.cpp b/modules/dnn/src/net_openvino.cpp
index e974ce34a3..c274f44a87 100644
--- a/modules/dnn/src/net_openvino.cpp
+++ b/modules/dnn/src/net_openvino.cpp
@@ -252,7 +252,7 @@ void NetImplOpenVINO::addNgraphOutputs(LayerData& ld)
             CV_Assert(!ieInpNode->net.empty());
             if (layerNet != ieInpNode->net)
             {
-                CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node->get_friendly_name());
+                CV_LOG_DEBUG(NULL, "DNN/IE: pin output between subnets: " << ieInpNode->node.get_node()->get_friendly_name());
                 ieInpNode->net->addOutput(ieInpNode);
             }
         }
@@ -321,8 +321,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
         return;
     }
 
+#if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2022_1)
     bool supportsCPUFallback = !isArmComputePlugin() && (preferableTarget == DNN_TARGET_CPU ||
                                openvino::checkTarget(DNN_TARGET_CPU));
+#endif
 
     // Build Inference Engine networks from sets of layers that support this
     // backend. Split a whole model on several Inference Engine networks if
@@ -341,6 +343,10 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
 
         bool fused = ld.skip;
         Ptr<Layer> layer = ld.layerInstance;
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
+        if (ld.id == 0)
+            continue;
+#else
         if (!fused && !layer->supportBackend(preferableBackend))
         {
             CV_LOG_DEBUG(NULL, "DNN/IE:    NOT supported!");
@@ -355,17 +361,6 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                 }
             }
 
-            // TODO: fix these workarounds
-            if (preferableTarget == DNN_TARGET_MYRIAD ||
-                preferableTarget == DNN_TARGET_HDDL ||
-                preferableTarget == DNN_TARGET_OPENCL ||
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
-                customizable &= ld.type != "Concat";
-
-            if (preferableTarget == DNN_TARGET_OPENCL ||
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
-                customizable &= ld.type != "Power";
-
             if (preferableTarget == DNN_TARGET_OPENCL)
                 customizable &= ld.type != "Eltwise";
 
@@ -390,6 +385,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                 continue;
             }
         }
+#endif
         ld.skip = true;  // Initially skip all Inference Engine supported layers.
 
         // Create a new network if one of inputs from different Inference Engine graph.
@@ -478,7 +474,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                 int oid = ld.inputBlobsId[i].oid;
 
                 auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
-                const auto& ngraph_input_node = ieInpNode->node;
+                const auto& ngraph_input_node = ieInpNode->node.get_node_shared_ptr();
                 CV_LOG_DEBUG(NULL, "DNN/IE: bind output port " << lid << ":" << oid << " (" << ngraph_input_node->get_friendly_name() << ":" << ngraph_input_node->get_type_info().name << ")");
 
                 if ((oid == 0 && ngraph_input_node->get_output_size() == 1) || lid == 0)
@@ -498,10 +494,7 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
                 }
                 CV_CheckLT((size_t)oid, ngraph_input_node->get_output_size(), "");
 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
-                // FIXIT refactor ".initNgraph()" API to use Output<Node>
-                // WA: use Concat to emulate Identity operation with requested output port
-                auto oid_node = std::make_shared<ngraph::op::Concat>(ngraph::OutputVector { ngraph_input_node->output(oid) }, 0);
-                inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(oid_node));
+                inputNodes[i] = new InfEngineNgraphNode(ngraph_input_node->output(oid));
 #elif INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_3)
                 inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid)));
 #else
@@ -556,6 +549,36 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
         addNgraphOutputs(ld);
     }
 
+    // The user may request only intermediate blobs rather than the network's final result (see Test_TFLite.max_unpooling).
+    // The layers producing such blobs must not be skipped when forwardLayer is called.
+    // Also perform a sanity check that no network is inferred twice (a single skip=false per unique net instance).
+    std::set<Ptr<InfEngineNgraphNet>> uniqueNets;
+    if (!blobsToKeep_.empty())
+    {
+        LayerPin latestLayerPin = getLatestLayerPin(blobsToKeep_);
+        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
+        {
+            LayerData& ld = it->second;
+            auto iter = ld.backendNodes.find(preferableBackend);
+            if (iter == ld.backendNodes.end())
+                continue;
+
+            Ptr<BackendNode>& node = iter->second;
+            if (node.empty())
+                continue;
+
+            Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
+            if (ieNode.empty())
+                continue;
+
+            if (ld.id == latestLayerPin.lid) {
+                ld.skip = false;
+                uniqueNets.insert(ieNode->net);
+                break;
+            }
+        }
+    }
+
     // Initialize all networks.
     for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
     {
@@ -578,9 +601,15 @@ void NetImplOpenVINO::initBackend(const std::vector<LayerPin>& blobsToKeep_)
         {
             ieNode->net->addOutput(ieNode);
             ieNode->net->createNet((Target)preferableTarget);
-            ld.skip = false;
+            if (uniqueNets.find(ieNode->net) == uniqueNets.end()) {
+                ld.skip = false;
+                uniqueNets.insert(ieNode->net);
+            }
         }
     }
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2022_1)
+    CV_Assert(uniqueNets.size() == 1);
+#endif
 }
 
 
diff --git a/modules/dnn/src/op_halide.cpp b/modules/dnn/src/op_halide.cpp
index 653de36146..db1a72278e 100644
--- a/modules/dnn/src/op_halide.cpp
+++ b/modules/dnn/src/op_halide.cpp
@@ -14,6 +14,7 @@
 #include "halide_scheduler.hpp"
 
 #include <HalideRuntimeOpenCL.h>
+#include <thread>
 #endif  // HAVE_HALIDE
 
 namespace cv {
diff --git a/modules/dnn/src/opencl/gemm_buffer.cl b/modules/dnn/src/opencl/gemm_buffer.cl
index b345983aee..70028b0eec 100644
--- a/modules/dnn/src/opencl/gemm_buffer.cl
+++ b/modules/dnn/src/opencl/gemm_buffer.cl
@@ -453,14 +453,14 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
     int w;
     for(int b_tile = 0; b_tile < K; b_tile += SLM_BLOCK) {
         barrier(CLK_LOCAL_MEM_FENCE);
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(0, K, local_index))), 0, (__local float *)(slm_brow + mad24(0, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(1, K, local_index))), 0, (__local float *)(slm_brow + mad24(1, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(2, K, local_index))), 0, (__local float *)(slm_brow + mad24(2, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(3, K, local_index))), 0, (__local float *)(slm_brow + mad24(3, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(4, K, local_index))), 0, (__local float *)(slm_brow + mad24(4, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(5, K, local_index))), 0, (__local float *)(slm_brow + mad24(5, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(6, K, local_index))), 0, (__local float *)(slm_brow + mad24(6, SLM_BLOCK, local_index)));
-        vstore4(vload4(0, (__global float *)(src1_read0 + mad24(7, K, local_index))), 0, (__local float *)(slm_brow + mad24(7, SLM_BLOCK, local_index)));
+        vstore8(vload8(0, src1_read0 + mad24(0, K, local_index)), 0, slm_brow + mad24(0, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(1, K, local_index)), 0, slm_brow + mad24(1, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(2, K, local_index)), 0, slm_brow + mad24(2, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(3, K, local_index)), 0, slm_brow + mad24(3, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(4, K, local_index)), 0, slm_brow + mad24(4, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(5, K, local_index)), 0, slm_brow + mad24(5, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(6, K, local_index)), 0, slm_brow + mad24(6, SLM_BLOCK, local_index));
+        vstore8(vload8(0, src1_read0 + mad24(7, K, local_index)), 0, slm_brow + mad24(7, SLM_BLOCK, local_index));
         barrier(CLK_LOCAL_MEM_FENCE);
 
         slm_brow0 = slm_brow + local_x * (TILE_K / 8);
@@ -469,17 +469,17 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
         while( w + TILE_K <= end_w ) {
             Dtype8 arow;
 
-            brow0 = as_half8(vload4(0, (__local float *)(slm_brow0 + 0 * SLM_BLOCK)));
-            brow1 = as_half8(vload4(0, (__local float *)(slm_brow0 + 1 * SLM_BLOCK)));
-            brow2 = as_half8(vload4(0, (__local float *)(slm_brow0 + 2 * SLM_BLOCK)));
-            brow3 = as_half8(vload4(0, (__local float *)(slm_brow0 + 3 * SLM_BLOCK)));
-            brow4 = as_half8(vload4(0, (__local float *)(slm_brow0 + 4 * SLM_BLOCK)));
-            brow5 = as_half8(vload4(0, (__local float *)(slm_brow0 + 5 * SLM_BLOCK)));
-            brow6 = as_half8(vload4(0, (__local float *)(slm_brow0 + 6 * SLM_BLOCK)));
-            brow7 = as_half8(vload4(0, (__local float *)(slm_brow0 + 7 * SLM_BLOCK)));
+            brow0 = vload8(0, slm_brow0 + 0 * SLM_BLOCK);
+            brow1 = vload8(0, slm_brow0 + 1 * SLM_BLOCK);
+            brow2 = vload8(0, slm_brow0 + 2 * SLM_BLOCK);
+            brow3 = vload8(0, slm_brow0 + 3 * SLM_BLOCK);
+            brow4 = vload8(0, slm_brow0 + 4 * SLM_BLOCK);
+            brow5 = vload8(0, slm_brow0 + 5 * SLM_BLOCK);
+            brow6 = vload8(0, slm_brow0 + 6 * SLM_BLOCK);
+            brow7 = vload8(0, slm_brow0 + 7 * SLM_BLOCK);
 
 #define MM_DOT_PRODUCT( _row, _dot )   \
-            arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \
+            arow = vload8(0, src0_read + _row * K); \
             _dot = mad( (Dtype8)(arow.s0), (Dtype8)(brow0.s0, brow1.s0, brow2.s0, brow3.s0, brow4.s0, brow5.s0, brow6.s0, brow7.s0), _dot ); \
             _dot = mad( (Dtype8)(arow.s1), (Dtype8)(brow0.s1, brow1.s1, brow2.s1, brow3.s1, brow4.s1, brow5.s1, brow6.s1, brow7.s1), _dot ); \
             _dot = mad( (Dtype8)(arow.s2), (Dtype8)(brow0.s2, brow1.s2, brow2.s2, brow3.s2, brow4.s2, brow5.s2, brow6.s2, brow7.s2), _dot ); \
@@ -510,7 +510,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
         Dtype8 arow;
 
 #define READ_BROW(_brow, _row) \
-        _brow = as_half8(vload4(0, (__local float *)(slm_brow0 + _row * SLM_BLOCK))); \
+        _brow = vload8(0, slm_brow0 + _row * SLM_BLOCK); \
         _brow.s0 = (mad24(local_x, 8, w) < K) ? _brow.s0 : 0.0f; \
         _brow.s1 = (mad24(local_x, 8, w + 1) < K) ? _brow.s1 : 0.0f; \
         _brow.s2 = (mad24(local_x, 8, w + 2) < K) ? _brow.s2 : 0.0f; \
@@ -532,7 +532,7 @@ __kernel void TEMPLATE(gemm_buffer_NT, Dtype)(
 #undef READ_BROW
 
 #define MM_DOT_PRODUCT( _row, _dot )   \
-        arow = as_half8(vload4(0, (__global float *)(src0_read + _row * K)));                           \
+        arow = vload8(0, src0_read + _row * K);                           \
         arow.s0 = (mad24(local_x, 8, w) < K) ? arow.s0 : 0.0f; \
         arow.s1 = (mad24(local_x, 8, w + 1) < K) ? arow.s1 : 0.0f; \
         arow.s2 = (mad24(local_x, 8, w + 2) < K) ? arow.s2 : 0.0f; \
diff --git a/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp b/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp
deleted file mode 100644
index 8ec99c9685..0000000000
--- a/modules/dnn/src/tengine4dnn/include/tengine_graph_convolution.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-#ifndef TENGINE_GRAPH_CONVOLUTION_HPP
-#define TENGINE_GRAPH_CONVOLUTION_HPP
-
-#define FLOAT_TO_REALSIZE (4)
-#ifdef HAVE_TENGINE
-
-#include "tengine_c_api.h"
-
-namespace cv
-{
-namespace dnn
-{
-// pad_h0: pad_top
-// pad_h1: pad_bottom
-// pad_w0: pad_left
-// pad_w1: pad_right
-teng_graph_t  tengine_init(const char* name , float* input_, int inch, int group, int in_h, int in_w,
-                        float *output_, int out_b, int outch, int out_h, int out_w,
-                        float *kernel_,int kernel_s , int kernel_h, int kernel_w,
-                        float *teg_bias, int stride_h, int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
-                        size_t wstep, const std::string padMode , teng_graph_t& graph, int nstripes) ;
-
-bool tengine_forward(teng_graph_t& graph) ;
-bool tengine_release(teng_graph_t& graph) ;
-}
-}
-#endif
-#endif /* TENGINE_GRAPH_CONVOLUTION_HPP */
\ No newline at end of file
diff --git a/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp b/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp
deleted file mode 100644
index d35937006c..0000000000
--- a/modules/dnn/src/tengine4dnn/src/tengine_graph_convolution.cpp
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: qtang@openailab.com
- */
-
-#include "../../precomp.hpp"
-#include <iostream>
-#include <vector>
-
-#include <opencv2/core/utils/configuration.private.hpp>
-#include <opencv2/core/utils/logger.hpp>
-
-#include "../include/tengine_graph_convolution.hpp"
-
-#ifdef HAVE_TENGINE
-
-#include "tengine_c_api.h"
-
-
-namespace cv
-{
-namespace dnn
-{
-static int create_input_node(teng_graph_t graph, const char* node_name, int inch, int in_h, int in_w)
-{
-    node_t node     = teng_create_graph_node(graph, node_name, "InputOp");
-    tensor_t tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
-    teng_set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT);
-
-    int dims[4] = {1, inch, in_h, in_w};
-    teng_set_tensor_shape(tensor, dims, 4);
-
-    teng_release_graph_tensor(tensor);
-    teng_release_graph_node(node);
-
-    return 0;
-}
-
-static int create_conv_node(teng_graph_t graph, const char* node_name, const char* input_name, int in_h, int in_w, int out_h, int out_w,
-    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h0, int pad_h1, int pad_w0, int pad_w1, int inch, int outch, int group,
-    int dilation_h, int dilation_w, int activation, std::string padMode)
-{
-    node_t conv_node      = teng_create_graph_node(graph, node_name, "Convolution");
-    tensor_t input_tensor = teng_get_graph_tensor(graph, input_name);
-
-    if (input_tensor == NULL)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: input_tensor is NULL." );
-        return -1;
-    }
-
-    teng_set_node_input_tensor(conv_node, 0, input_tensor);
-    teng_release_graph_tensor(input_tensor);
-
-    /* output */
-    tensor_t output_tensor = teng_create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
-
-    teng_set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
-    teng_release_graph_tensor(output_tensor);
-
-    /* weight */
-    std::string weight_name(node_name);
-    weight_name += "/weight";
-
-    node_t w_node = teng_create_graph_node(graph, weight_name.c_str(), "Const");
-    tensor_t w_tensor = teng_create_graph_tensor(graph, weight_name.c_str(), TENGINE_DT_FP32);
-    teng_set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
-    teng_set_node_input_tensor(conv_node, 1, w_tensor);
-    int w_dims[] = {outch, inch / group, kernel_h, kernel_w};
-
-    teng_set_tensor_shape(w_tensor, w_dims, 4);
-
-    teng_release_graph_node(w_node);
-    teng_release_graph_tensor(w_tensor);
-
-    /* bias */
-    std::string bias_name(node_name);
-    bias_name += "/bias";
-
-    node_t b_node = teng_create_graph_node(graph, bias_name.c_str(), "Const");
-    tensor_t b_tensor = teng_create_graph_tensor(graph, bias_name.c_str(), TENGINE_DT_FP32);
-    teng_set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
-    int b_dims[] = {outch};
-
-    teng_set_tensor_shape(b_tensor, b_dims, 1);
-
-    teng_set_node_input_tensor(conv_node, 2, b_tensor);
-    teng_release_graph_node(b_node);
-    teng_release_graph_tensor(b_tensor);
-
-    if (!padMode.empty())
-    {
-        if (padMode == "SAME")
-        {
-            int out_h_temp = (in_h-kernel_h + 2*pad_h0)/stride_h + 1;
-            int out_w_temp = (in_w-kernel_w + 2*pad_w0)/stride_w + 1;
-
-            if (out_h_temp < out_h)
-                pad_h1 += 1;
-            if (out_w_temp < out_w)
-                pad_w1 += 1;
-        }
-    }
-
-    /* attr */
-    teng_set_node_attr_int(conv_node, "kernel_h", &kernel_h);
-    teng_set_node_attr_int(conv_node, "kernel_w", &kernel_w);
-    teng_set_node_attr_int(conv_node, "stride_h", &stride_h);
-    teng_set_node_attr_int(conv_node, "stride_w", &stride_w);
-    teng_set_node_attr_int(conv_node, "pad_h0", &pad_h0);
-    teng_set_node_attr_int(conv_node, "pad_w0", &pad_w0);
-    teng_set_node_attr_int(conv_node, "pad_h1", &pad_h1);
-    teng_set_node_attr_int(conv_node, "pad_w1", &pad_w1);
-    teng_set_node_attr_int(conv_node, "output_channel", &outch);
-    teng_set_node_attr_int(conv_node, "input_channel", &inch);
-    teng_set_node_attr_int(conv_node, "group", &group);
-    teng_set_node_attr_int(conv_node, "dilation_h", &dilation_h);
-    teng_set_node_attr_int(conv_node, "dilation_w", &dilation_w);
-  //  set_node_attr_int(conv_node, "activation", &activation);
-
-    teng_release_graph_node(conv_node);
-
-    return 0;
-}
-
-static teng_graph_t create_conv_graph(const char* layer_name, float* input_data, int inch, int group, int in_h, int in_w,
-                        float* output_data, int outch, int out_h, int out_w,
-                        int kernel_h, int kernel_w,
-                        int stride_h,int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w, int activation,
-                        float* teg_weight, float* teg_bias, std::string padMode, int nstripes)
-{
-    node_t    conv_node     = NULL;
-
-    tensor_t  input_tensor  = NULL;
-    tensor_t  output_tensor = NULL;
-    tensor_t  weight_tensor = NULL;
-    tensor_t  bias_tensor   = NULL;
-
-    /* create graph for convolution */
-    int in_size  = in_h * in_w * inch;
-    int out_size  = out_h * out_w * outch;
-    int weight_size = outch * (inch / group) * kernel_w * kernel_h;
-    int bias_size = outch;
-
-    int buf_size  = 0;
-    int input_num = 0;
-
-    /* create graph */
-    teng_graph_t graph = teng_create_graph(NULL, NULL, NULL);
-    bool ok = true;
-
-    if(graph == NULL)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create_graph failed." );
-        ok = false;
-    }
-
-    const char* input_name = "data";
-    const char* conv_name  = layer_name;
-
-    if (ok && create_input_node(graph, input_name, inch, in_h, in_w) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create_input_node failed." );
-        ok = false;
-    }
-
-    if (ok && create_conv_node(graph, conv_name, input_name, in_h, in_w, out_h, out_w, kernel_h, kernel_w,
-        stride_h, stride_w, pad_h0, pad_h1, pad_w0, pad_w1, inch, outch, group, dilation_h, dilation_w, activation, padMode) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: create conv node failed." );
-        ok = false;
-    }
-
-    /* set input/output node */
-    const char* inputs_name[]  = {input_name};
-    const char* outputs_name[] = {conv_name};
-
-    if (ok && teng_set_graph_input_node(graph, inputs_name, sizeof(inputs_name) / sizeof(char*)) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: set inputs failed." );
-        ok = false;
-    }
-
-    if (ok && teng_set_graph_output_node(graph, outputs_name, sizeof(outputs_name) / sizeof(char*)) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: set outputs failed." );
-        ok = false;
-    }
-
-    /* set input data */
-    if (ok)
-    {
-        input_tensor = teng_get_graph_input_tensor(graph, 0, 0);
-        buf_size     = teng_get_tensor_buffer_size(input_tensor);
-        if (buf_size != in_size * FLOAT_TO_REALSIZE)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Input data size check failed.");
-            ok = false;
-        }
-    }
-
-    if (ok)
-    {
-        teng_set_tensor_buffer(input_tensor, (float *)input_data, buf_size);
-        teng_release_graph_tensor(input_tensor);
-
-        /* create convolution node */
-        /* set weight node */
-        conv_node     = teng_get_graph_node(graph, conv_name);
-        weight_tensor = teng_get_node_input_tensor(conv_node, 1);
-        buf_size      = teng_get_tensor_buffer_size(weight_tensor);
-
-        if (buf_size != weight_size * FLOAT_TO_REALSIZE)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Input weight size check failed.");
-            ok = false;
-        }
-    }
-
-    if (ok)
-    {
-        teng_set_tensor_buffer(weight_tensor, teg_weight, buf_size);
-
-        /* set bias node */
-        input_num = teng_get_node_input_number(conv_node);
-        if (input_num > 2)
-        {
-            bias_tensor = teng_get_node_input_tensor(conv_node, 2);
-            buf_size    = teng_get_tensor_buffer_size(bias_tensor);
-            if (buf_size != bias_size * FLOAT_TO_REALSIZE)
-            {
-                CV_LOG_WARNING(NULL,"Tengine: Input bias size check failed.");
-                ok = false;
-            }
-            else teng_set_tensor_buffer(bias_tensor, teg_bias, buf_size);
-        }
-    }
-
-    /* prerun */
-    if (ok && teng_prerun_graph_multithread(graph, TENGINE_CLUSTER_BIG, nstripes) < 0)
-    {
-        CV_LOG_WARNING(NULL, "Tengine: prerun_graph failed.");
-        ok = false;
-    }
-
-    if (ok)
-    {
-        /* set output data */
-        output_tensor = teng_get_node_output_tensor(conv_node, 0);
-        int ret = teng_set_tensor_buffer(output_tensor, output_data, out_size * FLOAT_TO_REALSIZE);
-        if(ret)
-        {
-            CV_LOG_WARNING(NULL,"Tengine: Set output tensor buffer failed." );
-            ok = false;
-        }
-    }
-
-    if (false == ok)
-    {
-        teng_destroy_graph(graph) ;
-        return NULL ;
-    }
-    return graph;
-}
-static bool tengine_init_flag = false;
-teng_graph_t tengine_init(const char* layer_name, float* input_, int inch, int group, int in_h, int in_w,
-                        float *output_, int out_b, int outch, int out_h, int out_w,
-                        float *kernel_, int kernel_s ,int kernel_h, int kernel_w,
-                        float *teg_bias, int stride_h, int stride_w,
-                        int pad_h0, int pad_h1, int pad_w0, int pad_w1, int dilation_h, int dilation_w,
-                        size_t wstep, const std::string padMode, teng_graph_t &graph, int nstripes)
-{
-    std::vector<float> teg_weight_vec;
-    float *teg_weight = NULL;
-    int kernel_inwh = (inch / group) * kernel_w * kernel_h;
-    // Do not using the activation fuse mode, just convolution only.
-    int activation = -1;
-
-    if (!(kernel_s == 2 && kernel_h == kernel_w
-        && dilation_h == dilation_w && stride_h == stride_w
-        && out_b == 1 && pad_h0 < 10 && pad_h1 < 10 && pad_w0 < 10 && pad_w1 < 10)) // just for Conv2D
-    {
-       // printf("return : just for Conv2D\n");
-        return NULL;
-    }
-
-    {
-      /*   printf("Tengine(%s): input (1 x %d x %d x %d),output (%d x %d x %d x %d), kernel (%d x %d), stride (%d x %d), dilation (%d x %d), pad (%d x %d).\n",
-               layer_name, inch, in_h, in_w,
-               out_b, outch, out_h, out_w,
-               kernel_w, kernel_h,
-               stride_w, stride_h,
-               dilation_w, dilation_h,
-               pad_h0, pad_h1, pad_w0, pad_w1);
-     */
-        // weight
-        if (kernel_inwh != wstep)
-        {
-            teg_weight_vec.resize(kernel_inwh * outch);
-            teg_weight = &teg_weight_vec[0];
-            for (int i=0; i<outch; i++)
-            {
-                memcpy(teg_weight+i*kernel_inwh, kernel_+i*wstep, kernel_inwh*FLOAT_TO_REALSIZE);
-            }
-        }
-        else
-        {
-            teg_weight = kernel_;
-        }
-
-        /* initial the resource of tengine */
-        if(false == tengine_init_flag)
-        {
-            init_tengine();
-            tengine_init_flag = true;
-        }
-
-        /* create the convolution graph */
-        graph = create_conv_graph(layer_name, input_, inch, group, in_h, in_w,
-                                    output_, outch, out_h, out_w,
-                                    kernel_h, kernel_w, stride_h,stride_w,
-                                    pad_h0, pad_h1, pad_w0, pad_w1, dilation_h, dilation_w, activation,
-                                    teg_weight, teg_bias, padMode, nstripes);
-        if(NULL == graph )
-        {
-            return NULL;
-        }
-    }
-    return graph ;
-}
-
-bool tengine_forward(teng_graph_t &graph)
-{
-    /* run */
-    if(teng_run_graph(graph, 1) < 0)
-    {
-        CV_LOG_WARNING(NULL,"Tengine: run_graph failed.");
-        return false ;
-    }
-    return true;
-}
-bool tengine_release(teng_graph_t &graph)
-{
-    teng_postrun_graph(graph);
-    teng_destroy_graph(graph);
-    return true;
-}
-}
-}
-#endif
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index da666ace01..9570355b4f 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -194,7 +194,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
     float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD || target == DNN_TARGET_CPU_FP16) ? 1.5e-2 : 0.0;
     float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063  : 0.0;
     float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262  : FLT_MIN;
-         processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
+         processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
                     inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
     expectNoFallbacksFromIE(net);
 }
@@ -237,7 +237,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe_Different_Width_Height)
         scoreDiff = 0.03;
         iouDiff = 0.08;
     }
-    processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
+    processNet("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", "dnn/MobileNetSSD_deploy_19e3ec3.prototxt",
                 inp, "detection_out", "", scoreDiff, iouDiff);
     expectNoFallbacksFromIE(net);
 }
diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp
index 708e353aac..66eff49979 100644
--- a/modules/dnn/test/test_caffe_importer.cpp
+++ b/modules/dnn/test/test_caffe_importer.cpp
@@ -290,8 +290,8 @@ TEST(Reproducibility_SSD, Accuracy)
 typedef testing::TestWithParam<tuple<Backend, Target> > Reproducibility_MobileNet_SSD;
 TEST_P(Reproducibility_MobileNet_SSD, Accuracy)
 {
-    const string proto = findDataFile("dnn/MobileNetSSD_deploy.prototxt", false);
-    const string model = findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false);
+    const string proto = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false);
+    const string model = findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false);
     Net net = readNetFromCaffe(proto, model);
     int backendId = get<0>(GetParam());
     int targetId = get<1>(GetParam());
@@ -731,7 +731,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_vgg16)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
 
-    double scoreDiff = 0.0, iouDiff = 0.0;
+    double scoreDiff = 0.001, iouDiff = 0.03;
 #if defined(INF_ENGINE_RELEASE)
     if (target == DNN_TARGET_MYRIAD)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
@@ -779,7 +779,7 @@ TEST_P(Test_Caffe_nets, FasterRCNN_zf)
                                            0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
                                            0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
 
-    double scoreDiff = 0.0, iouDiff = 0.0;
+    double scoreDiff = 0.003, iouDiff = 0.07;
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
         scoreDiff = 0.02;
         iouDiff = 0.13;
diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp
index d8a16d3efa..12e62c754a 100644
--- a/modules/dnn/test/test_halide_layers.cpp
+++ b/modules/dnn/test/test_halide_layers.cpp
@@ -407,15 +407,16 @@ INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, MaxPooling, Combine(
 ////////////////////////////////////////////////////////////////////////////////
 // Fully-connected
 ////////////////////////////////////////////////////////////////////////////////
-typedef TestWithParam<tuple<int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
+typedef TestWithParam<tuple<int, int, Size, int, bool, tuple<Backend, Target> > > FullyConnected;
 TEST_P(FullyConnected, Accuracy)
 {
-    int inChannels = get<0>(GetParam());
-    Size inSize = get<1>(GetParam());
-    int outChannels = get<2>(GetParam());
-    bool hasBias = get<3>(GetParam());
-    Backend backendId = get<0>(get<4>(GetParam()));
-    Target targetId = get<1>(get<4>(GetParam()));
+    int batch = get<0>(GetParam());
+    int inChannels = get<1>(GetParam());
+    Size inSize = get<2>(GetParam());
+    int outChannels = get<3>(GetParam());
+    bool hasBias = get<4>(GetParam());
+    Backend backendId = get<0>(get<5>(GetParam()));
+    Target targetId = get<1>(get<5>(GetParam()));
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2021040000)
     if ((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
          backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (targetId == DNN_TARGET_OPENCL_FP16 ||
@@ -424,6 +425,13 @@ TEST_P(FullyConnected, Accuracy)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
     }
 #endif
+    // https://github.com/openvinotoolkit/openvino/issues/19436
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16 && batch == 16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2023000000)
+    if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL && batch == 16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL);
+#endif
 
     Mat weights(outChannels, inChannels * inSize.height * inSize.width, CV_32F);
     randu(weights, -1.0f, 1.0f);
@@ -439,7 +447,7 @@ TEST_P(FullyConnected, Accuracy)
     lp.type = "InnerProduct";
     lp.name = "testLayer";
 
-    int sz[] = {1, inChannels, inSize.height, inSize.width};
+    int sz[] = {batch, inChannels, inSize.height, inSize.width};
     Mat input(4, &sz[0], CV_32F);
 
     double l1 = 0.0;
@@ -453,11 +461,13 @@ TEST_P(FullyConnected, Accuracy)
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL_FP16)
     {
         l1 = 0.01;
+        if (INF_ENGINE_VER_MAJOR_GE(2023000000))
+            lInf = 0.016;
     }
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && targetId == DNN_TARGET_OPENCL)
     {
         l1 = 5e-3;
-        lInf = 7e-3;
+        lInf = INF_ENGINE_VER_MAJOR_GE(2023000000) ? 0.016 : 7e-3;
     }
 #endif
     if (targetId == DNN_TARGET_CUDA_FP16)
@@ -467,6 +477,7 @@ TEST_P(FullyConnected, Accuracy)
 }
 
 INSTANTIATE_TEST_CASE_P(Layer_Test_Halide, FullyConnected, Combine(
+/*batch*/        Values(1, 2, 4, 8, 16),
 /*in channels*/  Values(3, 4),
 /*in size*/      Values(Size(5, 4), Size(4, 5), Size(1, 1)),
 /*out channels*/ Values(3, 4),
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
index 8b3cd01f29..caba112516 100644
--- a/modules/dnn/test/test_int8_layers.cpp
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -878,14 +878,14 @@ TEST_P(Test_Int8_nets, MobileNet_SSD)
     if (target == DNN_TARGET_OPENCL && !ocl::Device::getDefault().isIntel())
         applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
 
-    Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
-                               findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
+    Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.prototxt", false),
+                               findDataFile("dnn/MobileNetSSD_deploy_19e3ec3.caffemodel", false));
 
     Mat inp = imread(_tf("street.png"));
     Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
     Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
 
-    float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
+    float confThreshold = FLT_MIN, scoreDiff = 0.084, iouDiff = 0.43;
     testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
 }
 
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 4ee3e013cb..0c5fb28c5d 100644
--- a/modules/dnn/test/test_misc.cpp
+++ b/modules/dnn/test/test_misc.cpp
@@ -120,6 +120,28 @@ TEST(blobFromImageWithParams_4ch, letter_box)
     EXPECT_EQ(0, cvtest::norm(targetBlob, blob, NORM_INF));
 }
 
+// blobFromImagesWithParams() must scale every image of a batch: the second input is
+// 2*img, so its NHWC sub-blob has to be exactly twice the sub-blob of the first image.
+TEST(blobFromImagesWithParams_4ch, multi_image)
+{
+    Mat img(10, 10, CV_8UC4, cv::Scalar(0, 1, 2, 3));
+    Image2BlobParams param;
+    param.scalefactor = Scalar(0.1, 0.2, 0.3, 0.4);
+    param.datalayout = DNN_LAYOUT_NHWC;
+
+    Mat blobs = blobFromImagesWithParams(std::vector<Mat> { img, 2*img }, param);
+    ASSERT_EQ(4, blobs.dims);      // stop here rather than slice a blob of unexpected rank
+    ASSERT_EQ(2, blobs.size[0]);   // one batch entry per input image
+
+    // Cut the two per-image sub-blobs out along axis 0, keeping the other axes whole.
+    vector<Range> ranges(4, Range::all());
+    ranges[0] = Range(0, 1);
+    Mat blob0 = blobs(ranges);
+    ranges[0] = Range(1, 2);
+    Mat blob1 = blobs(ranges);
+    EXPECT_EQ(0, cvtest::norm(2*blob0, blob1, NORM_INF));
+}
+
 TEST(readNet, Regression)
 {
     Net net = readNet(findDataFile("dnn/squeezenet_v1.1.prototxt"),
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index a19923bf28..59b51c4bc0 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -490,8 +490,8 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
         refBoxes.emplace_back(left, top, width, height);
     }
 
-    std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
-    std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
+    std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
+    std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
 
     Scalar mean = Scalar(127.5, 127.5, 127.5);
     double scale = 1.0 / 127.5;
@@ -511,7 +511,7 @@ TEST_P(Test_Model, DetectionMobilenetSSD)
     }
     else if (target == DNN_TARGET_CUDA_FP16)
     {
-        scoreDiff = 0.0021;
+        scoreDiff = 0.0028;
         iouDiff = 1e-2;
     }
     float confThreshold = FLT_MIN;
@@ -595,8 +595,8 @@ TEST_P(Test_Model, Detection_normalized)
     std::vector<float> refConfidences = {0.999222f};
     std::vector<Rect2d> refBoxes = {Rect2d(0, 4, 227, 222)};
 
-    std::string weights_file = _tf("MobileNetSSD_deploy.caffemodel", false);
-    std::string config_file = _tf("MobileNetSSD_deploy.prototxt");
+    std::string weights_file = _tf("MobileNetSSD_deploy_19e3ec3.caffemodel", false);
+    std::string config_file = _tf("MobileNetSSD_deploy_19e3ec3.prototxt");
 
     Scalar mean = Scalar(127.5, 127.5, 127.5);
     double scale = 1.0 / 127.5;
diff --git a/modules/dnn/test/test_tflite_importer.cpp b/modules/dnn/test/test_tflite_importer.cpp
index 19b3f3a94a..4f3a8b4a96 100644
--- a/modules/dnn/test/test_tflite_importer.cpp
+++ b/modules/dnn/test/test_tflite_importer.cpp
@@ -128,6 +128,11 @@ TEST_P(Test_TFLite, max_unpooling)
     if (backend == DNN_BACKEND_CUDA)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA);
 
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2022010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif  // INF_ENGINE_RELEASE && INF_ENGINE_VER_MAJOR_LT(2022010000)
+
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU) {
         if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
         if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
@@ -152,14 +157,7 @@ TEST_P(Test_TFLite, max_unpooling)
     net.setInput(input);
 
     std::vector<std::vector<Mat> > outs;
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) {
-        // TODO: seems like a bug with a retrieving intermediate tensors
-        net.forward(outs, {"conv2d_transpose_4", "p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
-        outs.erase(outs.begin());
-    }
-    else {
-        net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
-    }
+    net.forward(outs, {"p_re_lu_1", "max_pooling_with_argmax2d", "conv2d_86", "max_unpooling2d_2"});
 
     ASSERT_EQ(outs.size(), 4);
     ASSERT_EQ(outs[0].size(), 1);
diff --git a/modules/features2d/3rdparty/mscr/chi_table.h b/modules/features2d/3rdparty/mscr/chi_table.h
new file mode 100644
index 0000000000..c0e9bae046
--- /dev/null
+++ b/modules/features2d/3rdparty/mscr/chi_table.h
@@ -0,0 +1,135 @@
+/*
+**
+**                           License Agreement
+**                           For chi_table.h
+**
+** Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
+**
+** Redistribution and use in source and binary forms, with or without modification,
+** are permitted provided that the following conditions are met:
+**
+**   * Redistribution's of source code must retain the above copyright notice,
+**     this list of conditions and the following disclaimer.
+**
+**   * Redistribution's in binary form must reproduce the above copyright notice,
+**     this list of conditions and the following disclaimer in the documentation
+**     and/or other materials provided with the distribution.
+**
+**   * The name of the copyright holders may not be used to endorse or promote products
+**     derived from this software without specific prior written permission.
+**
+** This software is provided by the copyright holders and contributors "as is" and
+** any express or implied warranties, including, but not limited to, the implied
+** warranties of merchantability and fitness for a particular purpose are disclaimed.
+** In no event shall the Intel Corporation or contributors be liable for any direct,
+** indirect, incidental, special, exemplary, or consequential damages
+** (including, but not limited to, procurement of substitute goods or services;
+** loss of use, data, or profits; or business interruption) however caused
+** and on any theory of liability, whether in contract, strict liability,
+** or tort (including negligence or otherwise) arising in any way out of
+** the use of this software, even if advised of the possibility of such damage.
+**
+** Content origin: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+*/
+#define TABLE_SIZE 400
+
+static double chitab3[]={0,  0.0150057,  0.0239478,  0.0315227,
+                  0.0383427,  0.0446605,  0.0506115,  0.0562786,
+                  0.0617174,  0.0669672,  0.0720573,  0.0770099,
+                  0.081843,  0.0865705,  0.0912043,  0.0957541,
+                  0.100228,  0.104633,  0.108976,  0.113261,
+                  0.117493,  0.121676,  0.125814,  0.12991,
+                  0.133967,  0.137987,  0.141974,  0.145929,
+                  0.149853,  0.15375,  0.15762,  0.161466,
+                  0.165287,  0.169087,  0.172866,  0.176625,
+                  0.180365,  0.184088,  0.187794,  0.191483,
+                  0.195158,  0.198819,  0.202466,  0.2061,
+                  0.209722,  0.213332,  0.216932,  0.220521,
+                  0.2241,  0.22767,  0.231231,  0.234783,
+                  0.238328,  0.241865,  0.245395,  0.248918,
+                  0.252435,  0.255947,  0.259452,  0.262952,
+                  0.266448,  0.269939,  0.273425,  0.276908,
+                  0.280386,  0.283862,  0.287334,  0.290803,
+                  0.29427,  0.297734,  0.301197,  0.304657,
+                  0.308115,  0.311573,  0.315028,  0.318483,
+                  0.321937,  0.32539,  0.328843,  0.332296,
+                  0.335749,  0.339201,  0.342654,  0.346108,
+                  0.349562,  0.353017,  0.356473,  0.35993,
+                  0.363389,  0.366849,  0.37031,  0.373774,
+                  0.377239,  0.380706,  0.384176,  0.387648,
+                  0.391123,  0.3946,  0.39808,  0.401563,
+                  0.405049,  0.408539,  0.412032,  0.415528,
+                  0.419028,  0.422531,  0.426039,  0.429551,
+                  0.433066,  0.436586,  0.440111,  0.44364,
+                  0.447173,  0.450712,  0.454255,  0.457803,
+                  0.461356,  0.464915,  0.468479,  0.472049,
+                  0.475624,  0.479205,  0.482792,  0.486384,
+                  0.489983,  0.493588,  0.4972,  0.500818,
+                  0.504442,  0.508073,  0.511711,  0.515356,
+                  0.519008,  0.522667,  0.526334,  0.530008,
+                  0.533689,  0.537378,  0.541075,  0.54478,
+                  0.548492,  0.552213,  0.555942,  0.55968,
+                  0.563425,  0.56718,  0.570943,  0.574715,
+                  0.578497,  0.582287,  0.586086,  0.589895,
+                  0.593713,  0.597541,  0.601379,  0.605227,
+                  0.609084,  0.612952,  0.61683,  0.620718,
+                  0.624617,  0.628526,  0.632447,  0.636378,
+                  0.64032,  0.644274,  0.648239,  0.652215,
+                  0.656203,  0.660203,  0.664215,  0.668238,
+                  0.672274,  0.676323,  0.680384,  0.684457,
+                  0.688543,  0.692643,  0.696755,  0.700881,
+                  0.70502,  0.709172,  0.713339,  0.717519,
+                  0.721714,  0.725922,  0.730145,  0.734383,
+                  0.738636,  0.742903,  0.747185,  0.751483,
+                  0.755796,  0.760125,  0.76447,  0.768831,
+                  0.773208,  0.777601,  0.782011,  0.786438,
+                  0.790882,  0.795343,  0.799821,  0.804318,
+                  0.808831,  0.813363,  0.817913,  0.822482,
+                  0.827069,  0.831676,  0.836301,  0.840946,
+                  0.84561,  0.850295,  0.854999,  0.859724,
+                  0.864469,  0.869235,  0.874022,  0.878831,
+                  0.883661,  0.888513,  0.893387,  0.898284,
+                  0.903204,  0.908146,  0.913112,  0.918101,
+                  0.923114,  0.928152,  0.933214,  0.938301,
+                  0.943413,  0.94855,  0.953713,  0.958903,
+                  0.964119,  0.969361,  0.974631,  0.979929,
+                  0.985254,  0.990608,  0.99599,  1.0014,
+                  1.00684,  1.01231,  1.01781,  1.02335,
+                  1.02891,  1.0345,  1.04013,  1.04579,
+                  1.05148,  1.05721,  1.06296,  1.06876,
+                  1.07459,  1.08045,  1.08635,  1.09228,
+                  1.09826,  1.10427,  1.11032,  1.1164,
+                  1.12253,  1.1287,  1.1349,  1.14115,
+                  1.14744,  1.15377,  1.16015,  1.16656,
+                  1.17303,  1.17954,  1.18609,  1.19269,
+                  1.19934,  1.20603,  1.21278,  1.21958,
+                  1.22642,  1.23332,  1.24027,  1.24727,
+                  1.25433,  1.26144,  1.26861,  1.27584,
+                  1.28312,  1.29047,  1.29787,  1.30534,
+                  1.31287,  1.32046,  1.32812,  1.33585,
+                  1.34364,  1.3515,  1.35943,  1.36744,
+                  1.37551,  1.38367,  1.39189,  1.4002,
+                  1.40859,  1.41705,  1.42561,  1.43424,
+                  1.44296,  1.45177,  1.46068,  1.46967,
+                  1.47876,  1.48795,  1.49723,  1.50662,
+                  1.51611,  1.52571,  1.53541,  1.54523,
+                  1.55517,  1.56522,  1.57539,  1.58568,
+                  1.59611,  1.60666,  1.61735,  1.62817,
+                  1.63914,  1.65025,  1.66152,  1.67293,
+                  1.68451,  1.69625,  1.70815,  1.72023,
+                  1.73249,  1.74494,  1.75757,  1.77041,
+                  1.78344,  1.79669,  1.81016,  1.82385,
+                  1.83777,  1.85194,  1.86635,  1.88103,
+                  1.89598,  1.91121,  1.92674,  1.94257,
+                  1.95871,  1.97519,  1.99201,  2.0092,
+                  2.02676,  2.04471,  2.06309,  2.08189,
+                  2.10115,  2.12089,  2.14114,  2.16192,
+                  2.18326,  2.2052,  2.22777,  2.25101,
+                  2.27496,  2.29966,  2.32518,  2.35156,
+                  2.37886,  2.40717,  2.43655,  2.46709,
+                  2.49889,  2.53206,  2.56673,  2.60305,
+                  2.64117,  2.6813,  2.72367,  2.76854,
+                  2.81623,  2.86714,  2.92173,  2.98059,
+                  3.04446,  3.1143,  3.19135,  3.27731,
+                  3.37455,  3.48653,  3.61862,  3.77982,
+                  3.98692,  4.2776,  4.77167,  133.333 };
diff --git a/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt b/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt
new file mode 100644
index 0000000000..66b272dd2d
--- /dev/null
+++ b/modules/features2d/3rdparty/mscr/chi_table_LICENSE.txt
@@ -0,0 +1,28 @@
+                          License Agreement
+                          For chi_table.h
+
+Copyright (C) 2007 Per-Erik Forssen, all rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+  * Redistribution's of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+  * Redistribution's in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+  * The name of the copyright holders may not be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+This software is provided by the copyright holders and contributors "as is" and
+any express or implied warranties, including, but not limited to, the implied
+warranties of merchantability and fitness for a particular purpose are disclaimed.
+In no event shall the Intel Corporation or contributors be liable for any direct,
+indirect, incidental, special, exemplary, or consequential damages
+(including, but not limited to, procurement of substitute goods or services;
+loss of use, data, or profits; or business interruption) however caused
+and on any theory of liability, whether in contract, strict liability,
+or tort (including negligence or otherwise) arising in any way out of
+the use of this software, even if advised of the possibility of such damage.
diff --git a/modules/features2d/CMakeLists.txt b/modules/features2d/CMakeLists.txt
index a586d4606e..91fea8bcc8 100644
--- a/modules/features2d/CMakeLists.txt
+++ b/modules/features2d/CMakeLists.txt
@@ -7,3 +7,5 @@ if(DEBUG_opencv_features2d)
   list(APPEND debug_modules opencv_highgui)
 endif()
 ocv_define_module(features2d opencv_imgproc ${debug_modules} OPTIONAL opencv_flann WRAP java objc python js)
+
+ocv_install_3rdparty_licenses(mscr "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mscr/chi_table_LICENSE.txt")  # NOTE(review): presumably registers the bundled chi_table.h license for installation - confirm against the macro
diff --git a/modules/features2d/src/mser.cpp b/modules/features2d/src/mser.cpp
index 39bcbf6938..3cada4ec75 100644
--- a/modules/features2d/src/mser.cpp
+++ b/modules/features2d/src/mser.cpp
@@ -30,18 +30,23 @@
  * OpenCV functions for MSER extraction
  *
  * 1. there are two different implementation of MSER, one for gray image, one for color image
- * 2. the gray image algorithm is taken from: Linear Time Maximally Stable Extremal Regions;
+ * 2. the gray image algorithm is taken from:
+ *      Linear Time Maximally Stable Extremal Regions;
  *    the paper claims to be faster than union-find method;
  *    it actually get 1.5~2m/s on my centrino L7200 1.2GHz laptop.
- * 3. the color image algorithm is taken from: Maximally Stable Colour Regions for Recognition and Match;
+ * 3. the color image algorithm is taken from:
+ *      Maximally Stable Colour Regions for Recognition and Match;
  *    it should be much slower than gray image method ( 3~4 times );
- *    the chi_table.h file is taken directly from paper's source code which is distributed under permissive BSD-like license: http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+ *    the chi_table.h file is taken directly from the paper's source code:
+ *    http://users.isy.liu.se/cvl/perfo/software/chi_table.h
+ *    license (BSD-like) is located in the file: 3rdparty/mscr/chi_table_LICENSE.txt
  * 4. though the name is *contours*, the result actually is a list of point set.
  */
 
 #include "precomp.hpp"
 #include "opencv2/imgproc/imgproc_c.h"
 #include <limits>
+#include "../3rdparty/mscr/chi_table.h"
 
 namespace cv
 {
@@ -613,113 +618,6 @@ the color MSER has not been completely refactored yet. We leave it mostly as-is,
 with just enough changes to convert C structures to C++ ones and
 add support for color images into MSER_Impl::detectAndLabel.
 */
-
-const int TABLE_SIZE = 400;
-
-static const float chitab3[]=
-{
-    0.f,  0.0150057f,  0.0239478f,  0.0315227f,
-    0.0383427f,  0.0446605f,  0.0506115f,  0.0562786f,
-    0.0617174f,  0.0669672f,  0.0720573f,  0.0770099f,
-    0.081843f,  0.0865705f,  0.0912043f,  0.0957541f,
-    0.100228f,  0.104633f,  0.108976f,  0.113261f,
-    0.117493f,  0.121676f,  0.125814f,  0.12991f,
-    0.133967f,  0.137987f,  0.141974f,  0.145929f,
-    0.149853f,  0.15375f,  0.15762f,  0.161466f,
-    0.165287f,  0.169087f,  0.172866f,  0.176625f,
-    0.180365f,  0.184088f,  0.187794f,  0.191483f,
-    0.195158f,  0.198819f,  0.202466f,  0.2061f,
-    0.209722f,  0.213332f,  0.216932f,  0.220521f,
-    0.2241f,  0.22767f,  0.231231f,  0.234783f,
-    0.238328f,  0.241865f,  0.245395f,  0.248918f,
-    0.252435f,  0.255947f,  0.259452f,  0.262952f,
-    0.266448f,  0.269939f,  0.273425f,  0.276908f,
-    0.280386f,  0.283862f,  0.287334f,  0.290803f,
-    0.29427f,  0.297734f,  0.301197f,  0.304657f,
-    0.308115f,  0.311573f,  0.315028f,  0.318483f,
-    0.321937f,  0.32539f,  0.328843f,  0.332296f,
-    0.335749f,  0.339201f,  0.342654f,  0.346108f,
-    0.349562f,  0.353017f,  0.356473f,  0.35993f,
-    0.363389f,  0.366849f,  0.37031f,  0.373774f,
-    0.377239f,  0.380706f,  0.384176f,  0.387648f,
-    0.391123f,  0.3946f,  0.39808f,  0.401563f,
-    0.405049f,  0.408539f,  0.412032f,  0.415528f,
-    0.419028f,  0.422531f,  0.426039f,  0.429551f,
-    0.433066f,  0.436586f,  0.440111f,  0.44364f,
-    0.447173f,  0.450712f,  0.454255f,  0.457803f,
-    0.461356f,  0.464915f,  0.468479f,  0.472049f,
-    0.475624f,  0.479205f,  0.482792f,  0.486384f,
-    0.489983f,  0.493588f,  0.4972f,  0.500818f,
-    0.504442f,  0.508073f,  0.511711f,  0.515356f,
-    0.519008f,  0.522667f,  0.526334f,  0.530008f,
-    0.533689f,  0.537378f,  0.541075f,  0.54478f,
-    0.548492f,  0.552213f,  0.555942f,  0.55968f,
-    0.563425f,  0.56718f,  0.570943f,  0.574715f,
-    0.578497f,  0.582287f,  0.586086f,  0.589895f,
-    0.593713f,  0.597541f,  0.601379f,  0.605227f,
-    0.609084f,  0.612952f,  0.61683f,  0.620718f,
-    0.624617f,  0.628526f,  0.632447f,  0.636378f,
-    0.64032f,  0.644274f,  0.648239f,  0.652215f,
-    0.656203f,  0.660203f,  0.664215f,  0.668238f,
-    0.672274f,  0.676323f,  0.680384f,  0.684457f,
-    0.688543f,  0.692643f,  0.696755f,  0.700881f,
-    0.70502f,  0.709172f,  0.713339f,  0.717519f,
-    0.721714f,  0.725922f,  0.730145f,  0.734383f,
-    0.738636f,  0.742903f,  0.747185f,  0.751483f,
-    0.755796f,  0.760125f,  0.76447f,  0.768831f,
-    0.773208f,  0.777601f,  0.782011f,  0.786438f,
-    0.790882f,  0.795343f,  0.799821f,  0.804318f,
-    0.808831f,  0.813363f,  0.817913f,  0.822482f,
-    0.827069f,  0.831676f,  0.836301f,  0.840946f,
-    0.84561f,  0.850295f,  0.854999f,  0.859724f,
-    0.864469f,  0.869235f,  0.874022f,  0.878831f,
-    0.883661f,  0.888513f,  0.893387f,  0.898284f,
-    0.903204f,  0.908146f,  0.913112f,  0.918101f,
-    0.923114f,  0.928152f,  0.933214f,  0.938301f,
-    0.943413f,  0.94855f,  0.953713f,  0.958903f,
-    0.964119f,  0.969361f,  0.974631f,  0.979929f,
-    0.985254f,  0.990608f,  0.99599f,  1.0014f,
-    1.00684f,  1.01231f,  1.01781f,  1.02335f,
-    1.02891f,  1.0345f,  1.04013f,  1.04579f,
-    1.05148f,  1.05721f,  1.06296f,  1.06876f,
-    1.07459f,  1.08045f,  1.08635f,  1.09228f,
-    1.09826f,  1.10427f,  1.11032f,  1.1164f,
-    1.12253f,  1.1287f,  1.1349f,  1.14115f,
-    1.14744f,  1.15377f,  1.16015f,  1.16656f,
-    1.17303f,  1.17954f,  1.18609f,  1.19269f,
-    1.19934f,  1.20603f,  1.21278f,  1.21958f,
-    1.22642f,  1.23332f,  1.24027f,  1.24727f,
-    1.25433f,  1.26144f,  1.26861f,  1.27584f,
-    1.28312f,  1.29047f,  1.29787f,  1.30534f,
-    1.31287f,  1.32046f,  1.32812f,  1.33585f,
-    1.34364f,  1.3515f,  1.35943f,  1.36744f,
-    1.37551f,  1.38367f,  1.39189f,  1.4002f,
-    1.40859f,  1.41705f,  1.42561f,  1.43424f,
-    1.44296f,  1.45177f,  1.46068f,  1.46967f,
-    1.47876f,  1.48795f,  1.49723f,  1.50662f,
-    1.51611f,  1.52571f,  1.53541f,  1.54523f,
-    1.55517f,  1.56522f,  1.57539f,  1.58568f,
-    1.59611f,  1.60666f,  1.61735f,  1.62817f,
-    1.63914f,  1.65025f,  1.66152f,  1.67293f,
-    1.68451f,  1.69625f,  1.70815f,  1.72023f,
-    1.73249f,  1.74494f,  1.75757f,  1.77041f,
-    1.78344f,  1.79669f,  1.81016f,  1.82385f,
-    1.83777f,  1.85194f,  1.86635f,  1.88103f,
-    1.89598f,  1.91121f,  1.92674f,  1.94257f,
-    1.95871f,  1.97519f,  1.99201f,  2.0092f,
-    2.02676f,  2.04471f,  2.06309f,  2.08189f,
-    2.10115f,  2.12089f,  2.14114f,  2.16192f,
-    2.18326f,  2.2052f,  2.22777f,  2.25101f,
-    2.27496f,  2.29966f,  2.32518f,  2.35156f,
-    2.37886f,  2.40717f,  2.43655f,  2.46709f,
-    2.49889f,  2.53206f,  2.56673f,  2.60305f,
-    2.64117f,  2.6813f,  2.72367f,  2.76854f,
-    2.81623f,  2.86714f,  2.92173f,  2.98059f,
-    3.04446f,  3.1143f,  3.19135f,  3.27731f,
-    3.37455f,  3.48653f,  3.61862f,  3.77982f,
-    3.98692f,  4.2776f,  4.77167f,  133.333f
-};
-
 struct MSCRNode;
 
 struct TempMSCR
diff --git a/modules/features2d/test/test_descriptors_regression.cpp b/modules/features2d/test/test_descriptors_regression.cpp
index 0258fea0f3..e44edb0769 100644
--- a/modules/features2d/test/test_descriptors_regression.cpp
+++ b/modules/features2d/test/test_descriptors_regression.cpp
@@ -142,7 +142,7 @@ TEST_P(DescriptorImage, no_crash)
 {
     vector<String> fnames;
     glob(cvtest::TS::ptr()->get_data_path() + pattern, fnames, false);
-    sort(fnames.begin(), fnames.end());
+    std::sort(fnames.begin(), fnames.end());
 
     Ptr<AKAZE> akaze_mldb = AKAZE::create(AKAZE::DESCRIPTOR_MLDB);
     Ptr<AKAZE> akaze_mldb_upright = AKAZE::create(AKAZE::DESCRIPTOR_MLDB_UPRIGHT);
diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index 9ecbb6d514..46ea208221 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -190,6 +190,9 @@ set(gapi_srcs
     src/backends/ov/bindings_ov.cpp
     src/backends/python/gpythonbackend.cpp
 
+    # Queue Streaming source
+    src/streaming/queue_source.cpp
+
     # OpenVPL Streaming source
     src/streaming/onevpl/source.cpp
     src/streaming/onevpl/source_priv.cpp
diff --git a/modules/gapi/cmake/DownloadADE.cmake b/modules/gapi/cmake/DownloadADE.cmake
index e22c4f1a32..26407f4fef 100644
--- a/modules/gapi/cmake/DownloadADE.cmake
+++ b/modules/gapi/cmake/DownloadADE.cmake
@@ -1,7 +1,7 @@
 set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade")
-set(ade_filename "v0.1.2a.zip")
-set(ade_subdir "ade-0.1.2a")
-set(ade_md5 "fa4b3e25167319cb0fa9432ef8281945")
+set(ade_filename "v0.1.2b.zip")
+set(ade_subdir "ade-0.1.2b")
+set(ade_md5 "4f93a0844dfc463c617d83b09011819a")
 ocv_download(FILENAME ${ade_filename}
              HASH ${ade_md5}
              URL
diff --git a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
index b56175788f..a1703a52cb 100644
--- a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
@@ -141,8 +141,10 @@ namespace detail
     template<typename U> struct GTypeOf<std::vector<U> >       { using type = cv::GArray<U>; };
     template<typename U> struct GTypeOf                        { using type = cv::GOpaque<U>;};
     template<>           struct GTypeOf<cv::MediaFrame>        { using type = cv::GFrame;    };
-    // FIXME: This is not quite correct since IStreamSource may produce not only Mat but also Scalar
-    // and vector data. TODO: Extend the type dispatching on these types too.
+
+    // FIXME: This is not quite correct since IStreamSource may
+    // produce not only Mat but also MediaFrame, Scalar and vector
+    // data. TODO: Extend the type dispatching on these types too.
     template<>           struct GTypeOf<cv::gapi::wip::IStreamSource::Ptr> { using type = cv::GMat;};
     template<class T> using g_type_of_t = typename GTypeOf<T>::type;
 
diff --git a/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp b/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp
new file mode 100644
index 0000000000..bd385ed16e
--- /dev/null
+++ b/modules/gapi/include/opencv2/gapi/streaming/queue_source.hpp
@@ -0,0 +1,67 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
+
+#include <memory>                      // shared_ptr
+#include <type_traits>                 // is_base_of
+
+#include <opencv2/gapi/garg.hpp>       // GRunArgs
+#include <opencv2/gapi/gmetaarg.hpp>   // GMetaArg + all descr_of
+#include <opencv2/gapi/streaming/source.hpp> // IStreamSource
+
+namespace cv {
+namespace gapi {
+namespace wip {
+struct Data; // fwd-declare to avoid circular? header dependencies
+
+class GAPI_EXPORTS QueueSourceBase: public cv::gapi::wip::IStreamSource {
+    class Priv;
+    std::shared_ptr<Priv> m_priv;
+    // FIXME: Need to understand how it works with IStreamSource's shared_from_this
+    // Can we avoid having too many shared_ptrs here?
+
+public:
+    explicit QueueSourceBase(const cv::GMetaArg &m); // m is the metadata reported by descr_of()
+    void push(Data &&data);                          // tags data with timestamp/seq_id and enqueues it
+    virtual bool pull(Data &data) override;          // returns false once halt() has been called
+    virtual void halt() override;                    // marks the source halted and wakes up pull()
+    virtual GMetaArg descr_of() const override;      // returns the metadata given at construction
+    virtual ~QueueSourceBase() = default;
+};
+
+/**
+ * @brief Queued streaming pipeline source.
+ * Typed front-end to QueueSourceBase: push() accepts objects of type T.
+ */
+template<class T>
+class QueueSource final: public QueueSourceBase
+{
+public:
+    using Meta = decltype(cv::descr_of(T{})); // metadata type cv::descr_of() yields for T
+    explicit QueueSource(Meta m) : QueueSourceBase(GMetaArg{m}) {
+    }
+    void push(T t) {
+        QueueSourceBase::push(Data{t}); // wrap into Data and hand over to the base class
+    }
+};
+
+class GAPI_EXPORTS QueueInput {
+    std::vector<std::shared_ptr<QueueSourceBase> > m_sources;
+
+public:
+    explicit QueueInput(const cv::GMetaArgs &args);
+
+    void push(cv::GRunArgs &&ins);
+    operator cv::GRunArgs();
+};
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP
diff --git a/modules/gapi/include/opencv2/gapi/streaming/source.hpp b/modules/gapi/include/opencv2/gapi/streaming/source.hpp
index 6597cad8f8..267469ad1b 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/source.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/source.hpp
@@ -16,7 +16,7 @@
 namespace cv {
 namespace gapi {
 namespace wip {
-    struct Data; // "forward-declaration" of GRunArg
+struct Data; // forward-declaration of Data to avoid circular dependencies
 
 /**
  * @brief Abstract streaming pipeline source.
@@ -43,6 +43,11 @@ public:
     Ptr ptr() { return shared_from_this(); }
     virtual bool pull(Data &data) = 0;
     virtual GMetaArg descr_of() const = 0;
+    virtual void halt() {
+        // No-op by default to maintain compatibility with the existing sources.
+        // Sources whose pull() may block should override this to break the wait.
+        // FIXME: Make it mandatory in OpenCV 5.0
+    }
     virtual ~IStreamSource() = default;
 };
 
diff --git a/modules/gapi/misc/python/test/test_gapi_infer.py b/modules/gapi/misc/python/test/test_gapi_infer.py
index 8ecc957e41..d075651e87 100644
--- a/modules/gapi/misc/python/test/test_gapi_infer.py
+++ b/modules/gapi/misc/python/test/test_gapi_infer.py
@@ -38,8 +38,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             img_path  = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -73,8 +73,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -112,8 +112,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
@@ -161,8 +161,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id    = 'CPU'
 
             rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
@@ -211,8 +211,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
             device_id    = 'CPU'
             img          = cv.resize(cv.imread(img_path), (544, 320))
@@ -270,8 +270,8 @@ try:
                 return
 
             root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
             device_id    = 'CPU'
             img          = cv.resize(cv.imread(img_path), (544, 320))
diff --git a/modules/gapi/misc/python/test/test_gapi_infer_ov.py b/modules/gapi/misc/python/test/test_gapi_infer_ov.py
index b4022b6e2d..f48ec96369 100644
--- a/modules/gapi/misc/python/test/test_gapi_infer_ov.py
+++ b/modules/gapi/misc/python/test/test_gapi_infer_ov.py
@@ -86,8 +86,8 @@ try:
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -119,8 +119,8 @@ try:
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -148,8 +148,8 @@ try:
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path1 = self.find_file('cv/face/david1.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
@@ -190,8 +190,8 @@ try:
             skip_if_openvino_not_available()
 
             root_path  = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            model_path = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
+            bin_path   = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')], required=False)
             device_id  = 'CPU'
 
             img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
diff --git a/modules/gapi/src/api/gbackend.cpp b/modules/gapi/src/api/gbackend.cpp
index efbe17a305..46c8dc1640 100644
--- a/modules/gapi/src/api/gbackend.cpp
+++ b/modules/gapi/src/api/gbackend.cpp
@@ -36,7 +36,6 @@ cv::gapi::GBackend::Priv::compile(const ade::Graph&,
 {
     // ...and this method is here for the same reason!
     GAPI_Error("InternalError");
-    return {};
 }
 
 std::unique_ptr<cv::gimpl::GIslandExecutable>
@@ -224,7 +223,6 @@ void bindOutArg(Mag& mag, const RcDesc &rc, const GRunArgP &arg, HandleRMat hand
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -256,7 +254,6 @@ void resetInternalData(Mag& mag, const Data &d)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -284,7 +281,6 @@ cv::GRunArg getArg(const Mag& mag, const RcDesc &ref)
                        mag.meta<cv::MediaFrame>().at(ref.id));
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -327,7 +323,6 @@ cv::GRunArgP getObjPtr(Mag& mag, const RcDesc &rc, bool is_umat)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
@@ -359,7 +354,6 @@ void writeBack(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg)
 
     default:
         util::throw_error(std::logic_error("Unsupported GShape type"));
-        break;
     }
 }
 
diff --git a/modules/gapi/src/compiler/gislandmodel.hpp b/modules/gapi/src/compiler/gislandmodel.hpp
index 3a1a8d5ab9..ade13a6f33 100644
--- a/modules/gapi/src/compiler/gislandmodel.hpp
+++ b/modules/gapi/src/compiler/gislandmodel.hpp
@@ -192,6 +192,7 @@ class GIslandEmitter
 public:
     // Obtain next value from the emitter
     virtual bool pull(GRunArg &) = 0;
+    virtual void halt() = 0;
     virtual ~GIslandEmitter() = default;
 };
 
diff --git a/modules/gapi/src/executor/gstreamingexecutor.cpp b/modules/gapi/src/executor/gstreamingexecutor.cpp
index 124b27f39c..6a397faca6 100644
--- a/modules/gapi/src/executor/gstreamingexecutor.cpp
+++ b/modules/gapi/src/executor/gstreamingexecutor.cpp
@@ -41,6 +41,10 @@ using namespace cv::gimpl::stream;
 class VideoEmitter final: public cv::gimpl::GIslandEmitter {
     cv::gapi::wip::IStreamSource::Ptr src;
 
+    virtual void halt() override {
+        src->halt();
+    }
+
     virtual bool pull(cv::GRunArg &arg) override {
         // FIXME: probably we can maintain a pool of (then) pre-allocated
         // buffers to avoid runtime allocations.
@@ -62,6 +66,10 @@ public:
 class ConstEmitter final: public cv::gimpl::GIslandEmitter {
     cv::GRunArg m_arg;
 
+    virtual void halt() override {
+        // Not used here, but in fact can be used.
+    }
+
     virtual bool pull(cv::GRunArg &arg) override {
         arg = const_cast<const cv::GRunArg&>(m_arg); // FIXME: variant workaround
         return true;
@@ -1918,6 +1926,11 @@ void cv::gimpl::GStreamingExecutor::stop()
     for (auto &q : m_emitter_queues) {
         q.push(stream::Cmd{stream::Stop{}});
     }
+    // Also kindly ask emitter object to halt to break the blocking src->pull()
+    // loop
+    for (auto &nh : m_emitters) {
+        m_gim.metadata(nh).get<Emitter>().object->halt();
+    }
 
     // Pull messages from the final queue to ensure completion
     Cmd cmd;
diff --git a/modules/gapi/src/streaming/queue_source.cpp b/modules/gapi/src/streaming/queue_source.cpp
new file mode 100644
index 0000000000..59fde09c44
--- /dev/null
+++ b/modules/gapi/src/streaming/queue_source.cpp
@@ -0,0 +1,98 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+#include <chrono>
+#include <atomic>
+
+#include <ade/util/zip_range.hpp>
+
+#include <opencv2/gapi/streaming/queue_source.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+
+#include "executor/conc_queue.hpp"
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+class QueueSourceBase::Priv {
+public:
+    explicit Priv(const cv::GMetaArg &meta)
+        : m(meta)
+        , halted(false) {
+    }
+
+    cv::GMetaArg m;                                          // returned by descr_of()
+    cv::gapi::own::concurrent_bounded_queue<cv::GRunArg> q;  // push() -> pull() channel
+    int64_t c = 0;                                           // next seq_id to assign
+    std::atomic<bool> halted;                                // set by halt()
+};
+
+QueueSourceBase::QueueSourceBase(const cv::GMetaArg &m)
+    : m_priv(new Priv(m)) {
+}
+
+void QueueSourceBase::push(Data &&data) {
+
+    // Tag data with seq_id/ts
+    const auto now = std::chrono::system_clock::now();
+    const auto dur = std::chrono::duration_cast<std::chrono::microseconds>
+        (now.time_since_epoch());
+    data.meta[cv::gapi::streaming::meta_tag::timestamp] = int64_t{dur.count()};
+    data.meta[cv::gapi::streaming::meta_tag::seq_id]    = int64_t{m_priv->c++};
+
+    m_priv->q.push(data);
+}
+
+bool QueueSourceBase::pull(Data &data) {
+    // Take the next queued item. halt() pushes an empty GRunArg so
+    // that a pop() pending here returns when the stream is stopped.
+    m_priv->q.pop(data);
+
+    // Once halted, report end-of-stream regardless of what was popped.
+    return !m_priv->halted;
+}
+
+void QueueSourceBase::halt() {
+    m_priv->halted.store(true);
+    m_priv->q.push(cv::GRunArg{});
+}
+
+cv::GMetaArg QueueSourceBase::descr_of() const {
+    return m_priv->m;
+}
+
+QueueInput::QueueInput(const cv::GMetaArgs &args) {
+    for (auto &&m : args) {
+        m_sources.emplace_back(new cv::gapi::wip::QueueSourceBase(m));
+    }
+}
+
+void QueueInput::push(cv::GRunArgs &&args) {
+    GAPI_Assert(m_sources.size() == args.size());
+    for (auto && it : ade::util::zip(ade::util::toRange(m_sources),
+                                     ade::util::toRange(args)))
+    {
+        auto &src = std::get<0>(it);
+        auto &obj = std::get<1>(it);
+
+        Data d;
+        d = obj;
+        src->push(std::move(d));
+    }
+}
+
+QueueInput::operator cv::GRunArgs () {
+    cv::GRunArgs args;
+    for (auto &&s : m_sources) {
+        args.push_back(s->ptr());
+    }
+    return args;
+}
+
+} // wip
+} // gapi
+} // cv
diff --git a/modules/gapi/test/infer/gapi_infer_ie_test.cpp b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
index 58e37040e8..92de39abfa 100644
--- a/modules/gapi/test/infer/gapi_infer_ie_test.cpp
+++ b/modules/gapi/test/infer/gapi_infer_ie_test.cpp
@@ -187,8 +187,8 @@ std::string compileAgeGenderBlob(const std::string& device) {
         cv::gapi::ie::detail::ParamDesc params;
         const std::string model_name = "age-gender-recognition-retail-0013";
         const std::string output  = model_name + ".blob";
-        params.model_path   = findDataFile(SUBDIR + model_name + ".xml");
-        params.weights_path = findDataFile(SUBDIR + model_name + ".bin");
+        params.model_path   = findDataFile(SUBDIR + model_name + ".xml", false);
+        params.weights_path = findDataFile(SUBDIR + model_name + ".bin", false);
         params.device_id    = device;
         compileBlob(params, output, IE::Precision::U8);
         return output;
@@ -205,8 +205,8 @@ TEST(TestAgeGenderIE, InferBasicTensor)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -256,8 +256,8 @@ TEST(TestAgeGenderIE, InferBasicImage)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // FIXME: Ideally it should be an image from disk
@@ -334,8 +334,8 @@ struct InferWithReshape: public ::testing::Test {
         reshape_dims = {1, 3, 70, 70};
 
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
 
         params.device_id = "CPU";
 
@@ -432,8 +432,8 @@ struct ROIList: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         // FIXME: it must be cv::imread(findDataFile("../dnn/grace_hopper_227.png", false));
@@ -505,8 +505,8 @@ struct ROIListNV12: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         cv::Size sz{320, 240};
@@ -585,8 +585,8 @@ struct SingleROI: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         // FIXME: it must be cv::imread(findDataFile("../dnn/grace_hopper_227.png", false));
@@ -644,8 +644,8 @@ struct SingleROINV12: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         cv::Size sz{320, 240};
@@ -809,8 +809,8 @@ TEST(TestAgeGenderIE, GenericInfer)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Mat in_mat(cv::Size(320, 240), CV_8UC3);
@@ -859,8 +859,8 @@ TEST(TestAgeGenderIE, InvalidConfigGeneric)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     // Configure & run G-API
@@ -885,8 +885,8 @@ TEST(TestAgeGenderIE, CPUConfigGeneric)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     // Configure & run G-API
@@ -912,8 +912,8 @@ TEST(TestAgeGenderIE, InvalidConfig)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     using AGInfo = std::tuple<cv::GMat, cv::GMat>;
@@ -937,8 +937,8 @@ TEST(TestAgeGenderIE, CPUConfig)
 {
     initDLDTDataPath();
 
-    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    std::string model_path   = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    std::string weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     std::string device_id    = "CPU";
 
     using AGInfo = std::tuple<cv::GMat, cv::GMat>;
@@ -1017,8 +1017,8 @@ TEST(TestAgeGenderIE, MediaInputNV12)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1082,8 +1082,8 @@ TEST(TestAgeGenderIE, MediaInputBGR)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1134,8 +1134,8 @@ TEST(InferROI, MediaInputBGR)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1196,8 +1196,8 @@ TEST(InferROI, MediaInputNV12)
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Size sz{320, 240};
@@ -1587,8 +1587,8 @@ TEST(Infer, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1654,8 +1654,8 @@ TEST(InferROI, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1732,8 +1732,8 @@ TEST(InferList, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1821,8 +1821,8 @@ TEST(Infer2, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1911,8 +1911,8 @@ TEST(InferEmptyList, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -1965,8 +1965,8 @@ TEST(Infer2EmptyList, TestStreamingInfer)
     std::string filepath = findDataFile("cv/video/768x576.avi");
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
@@ -2294,8 +2294,8 @@ struct LimitedSourceInfer: public ::testing::Test {
 
     GStreamingCompiled compileStreaming(int nireq) {
         cv::gapi::ie::detail::ParamDesc params;
-        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         params.device_id = "CPU";
 
         auto pp = cv::gapi::ie::Params<AgeGender> {
@@ -2348,8 +2348,8 @@ TEST(TestAgeGenderIE, InferWithBatch)
 
     constexpr int batch_size = 4;
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     cv::Mat in_mat({batch_size, 3, 62, 62}, CV_8U);
@@ -3091,8 +3091,8 @@ struct AgeGenderInferTest: public ::testing::Test {
 
     void SetUp() {
         initDLDTDataPath();
-        m_params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        m_params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        m_params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        m_params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         m_params.device_id = "CPU";
 
         m_plugin = cv::gimpl::ie::wrap::getPlugin(m_params);
@@ -3191,8 +3191,8 @@ TEST(TestAgeGenderIE, InferTensorWithPreproc) {
     initDLDTDataPath();
 
     cv::gapi::ie::detail::ParamDesc params;
-    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+    params.model_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+    params.weights_path = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
     params.device_id = "CPU";
 
     // Load IE network, initialize input data using that.
diff --git a/modules/gapi/test/infer/gapi_infer_ov_tests.cpp b/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
index 09b54c1a46..abce82b329 100644
--- a/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
+++ b/modules/gapi/test/infer/gapi_infer_ov_tests.cpp
@@ -255,8 +255,8 @@ private:
 struct BaseAgeGenderOV: public ::testing::Test {
     BaseAgeGenderOV() {
         initDLDTDataPath();
-        xml_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml");
-        bin_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin");
+        xml_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.xml", false);
+        bin_path  = findDataFile(SUBDIR + "age-gender-recognition-retail-0013.bin", false);
         device    = "CPU";
         blob_path = "age-gender-recognition-retail-0013.blob";
     }
diff --git a/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp
new file mode 100644
index 0000000000..093e654715
--- /dev/null
+++ b/modules/gapi/test/streaming/gapi_streaming_queue_source_tests.cpp
@@ -0,0 +1,127 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2023 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+
+#include <opencv2/gapi/gstreaming.hpp>
+#include <opencv2/gapi/streaming/queue_source.hpp>
+#include <opencv2/gapi/streaming/cap.hpp>
+
+namespace opencv_test
+{
+
+TEST(GAPI_Streaming_Queue_Source, SmokeTest) {
+    // This is more of a usage example for the G-API Queue Source than a test.
+    // Graph under test: out = in + 1, compiled for streaming execution.
+    cv::GMat in;
+    cv::GMat out = in + 1;
+    cv::GStreamingCompiled comp = cv::GComputation(in, out).compileStreaming();
+
+    // Queue source needs to know format information to maintain contracts
+    auto src = std::make_shared<cv::gapi::wip::QueueSource<cv::Mat> >
+        (cv::GMatDesc{CV_8U, 1, cv::Size{128, 128}});
+
+    comp.setSource(cv::gin(src->ptr()));
+    comp.start();
+
+    // It is perfectly legal to start a pipeline at this point - the source was passed.
+    // Now we can push data through the source and get the pipeline results.
+
+    cv::Mat eye = cv::Mat::eye(cv::Size{128, 128}, CV_8UC1);
+    src->push(eye);    // Push I (identity matrix)
+    src->push(eye*2);  // Push I*2
+
+    // Now it's time to pop. The data may already have been processed at this point.
+    // Note: the queue source's queues are unbounded to avoid deadlocks.
+    // Results are expected in push order: (I + 1) first, then (I*2 + 1).
+    cv::Mat result;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye + 1, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye*2 + 1, result, NORM_INF));
+}
+
+TEST(GAPI_Streaming_Queue_Source, Mixed) {
+    // Mixing a regular "live" source (which runs on its own) with a
+    // manually controlled queue source may make little sense, but
+    // is perfectly legal and possible.
+    // Graph under test: out = in2 - in1 (video frame minus the pushed matrix).
+    cv::GMat in1;
+    cv::GMat in2;
+    cv::GMat out = in2 - in1;
+    cv::GStreamingCompiled comp = cv::GComputation(in1, in2, out).compileStreaming();
+
+    // Queue source needs to know format information to maintain contracts
+    auto src1 = std::make_shared<cv::gapi::wip::QueueSource<cv::Mat> >
+        (cv::GMatDesc{CV_8U, 3, cv::Size{768, 576}});
+    // The second input is a file-backed capture source; skip the test if it can't be created.
+    std::shared_ptr<cv::gapi::wip::IStreamSource> src2;
+    auto path = findDataFile("cv/video/768x576.avi");
+    try {
+        src2 = cv::gapi::wip::make_src<cv::gapi::wip::GCaptureSource>(path);
+    } catch(...) {
+        throw SkipTestException("Video file can not be opened");
+    }
+
+    comp.setSource(cv::gin(src1->ptr(), src2)); // FIXME: quite inconsistent (src1 needs ->ptr(), src2 is passed as-is)
+    comp.start();
+
+    cv::Mat eye = cv::Mat::eye(cv::Size{768, 576}, CV_8UC3);
+    src1->push(eye);    // Push I (identity matrix)
+    src1->push(eye);    // Push I (again)
+
+    cv::Mat ref, result;
+    cv::VideoCapture cap(path);  // Reads the same video independently to obtain reference frames
+
+    cap >> ref;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(ref - eye, result, NORM_INF));
+
+    cap >> ref;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(ref - eye, result, NORM_INF));
+}
+
+TEST(GAPI_Streaming_Queue_Input, SmokeTest) {
+
+    // Queue Input: a thin wrapper on top of multiple queue sources.
+    // Allows users to pass all input data at once.
+    // Graph under test: out = in1 + in2 (a matrix plus a scalar).
+    cv::GMat in1;
+    cv::GScalar in2;
+    cv::GMat out = in1 + in2;
+    cv::GStreamingCompiled comp = cv::GComputation(cv::GIn(in1, in2), cv::GOut(out))
+        .compileStreaming();
+
+    // FIXME: This API is too raw
+    cv::gapi::wip::QueueInput input({
+            cv::GMetaArg{ cv::GMatDesc{CV_8U, 1, cv::Size{64,64} } },
+            cv::GMetaArg{ cv::empty_scalar_desc() }
+        });
+    comp.setSource(input); // Implicit conversion allows it to be passed as-is.
+    comp.start();
+
+    // Push three (matrix, scalar) input sets via the queue input
+    cv::Mat eye = cv::Mat::eye(cv::Size{64, 64}, CV_8UC1);
+    input.push(cv::gin(eye, cv::Scalar(1)));
+    input.push(cv::gin(eye, cv::Scalar(2)));
+    input.push(cv::gin(eye, cv::Scalar(3)));
+
+    // Pop the three results and validate them in push order: eye+1, eye+2, eye+3
+    cv::Mat result;
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+1, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+2, result, NORM_INF));
+
+    ASSERT_TRUE(comp.pull(cv::gout(result)));
+    EXPECT_EQ(0, cvtest::norm(eye+3, result, NORM_INF));
+}
+
+} // namespace opencv_test
diff --git a/modules/highgui/src/window_cocoa.mm b/modules/highgui/src/window_cocoa.mm
index a4b62f2717..8dc640fdd6 100644
--- a/modules/highgui/src/window_cocoa.mm
+++ b/modules/highgui/src/window_cocoa.mm
@@ -184,6 +184,9 @@ void destroyWindowImpl( const char* name)
     //cout << "destroyWindowImpl" << endl;
     CVWindow *window = cvGetWindow(name);
     if(window) {
+        if ([window styleMask] & NSFullScreenWindowMask) {
+            [window toggleFullScreen:nil];
+        }
         [window close];
         [windows removeObjectForKey:[NSString stringWithFormat:@"%s", name]];
     }
@@ -668,7 +671,11 @@ double cvGetModeWindow_COCOA( const char* name )
 void cvSetModeWindow_COCOA( const char* name, double prop_value )
 {
     CVWindow *window = nil;
+
+#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7
     NSDictionary *fullscreenOptions = nil;
+#endif
+
     NSAutoreleasePool* localpool = nil;
 
     CV_FUNCNAME( "cvSetModeWindow_COCOA" );
@@ -692,6 +699,31 @@ void cvSetModeWindow_COCOA( const char* name, double prop_value )
 
     localpool = [[NSAutoreleasePool alloc] init];
 
+#if MAC_OS_X_VERSION_MAX_ALLOWED > MAC_OS_X_VERSION_10_6
+    if ( ([window styleMask] & NSFullScreenWindowMask) && prop_value==cv::WINDOW_NORMAL )
+    {
+        [window toggleFullScreen:nil];
+
+        window.status=cv::WINDOW_NORMAL;
+    }
+    else if( !([window styleMask] & NSFullScreenWindowMask) && prop_value==cv::WINDOW_FULLSCREEN )
+    {
+        [window setCollectionBehavior:NSWindowCollectionBehaviorFullScreenPrimary];
+
+        NSScreen* screen = [window screen];
+
+        NSRect frame = [screen frame];
+        [window setFrame:frame display:YES];
+
+        [window setContentSize:frame.size];
+
+        [window toggleFullScreen:nil];
+
+        [window setFrameTopLeftPoint: frame.origin];
+
+        window.status=cv::WINDOW_FULLSCREEN;
+    }
+#else
     fullscreenOptions = [NSDictionary dictionaryWithObject:[NSNumber numberWithBool:YES] forKey:NSFullScreenModeSetting];
     if ( [[window contentView] isInFullScreenMode] && prop_value==cv::WINDOW_NORMAL )
     {
@@ -703,7 +735,7 @@ void cvSetModeWindow_COCOA( const char* name, double prop_value )
         [[window contentView] enterFullScreenMode:[NSScreen mainScreen] withOptions:fullscreenOptions];
         window.status=cv::WINDOW_FULLSCREEN;
     }
-
+#endif
     [localpool drain];
 
     __END__;
@@ -777,7 +809,7 @@ void cvSetPropTopmost_COCOA( const char* name, const bool topmost )
         CV_ERROR( CV_StsNullPtr, "NULL window" );
     }
 
-    if ([[window contentView] isInFullScreenMode])
+    if (([window styleMask] & NSFullScreenWindowMask))
     {
         EXIT;
     }
diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp
index ed21f3f14c..4febee36db 100644
--- a/modules/imgcodecs/src/grfmt_tiff.cpp
+++ b/modules/imgcodecs/src/grfmt_tiff.cpp
@@ -245,7 +245,7 @@ bool TiffDecoder::readHeader()
     if (!tif)
     {
         // TIFFOpen() mode flags are different to fopen().  A 'b' in mode "rb" has no effect when reading.
-        // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+        // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
         if ( !m_buf.empty() )
         {
             m_buf_pos = 0;
@@ -1118,7 +1118,7 @@ public:
     TIFF* open ()
     {
         // do NOT put "wb" as the mode, because the b means "big endian" mode, not "binary" mode.
-        // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+        // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
         return TIFFClientOpen( "", "w", reinterpret_cast<thandle_t>(this), &TiffEncoderBufHelper::read,
                                &TiffEncoderBufHelper::write, &TiffEncoderBufHelper::seek,
                                &TiffEncoderBufHelper::close, &TiffEncoderBufHelper::size,
@@ -1200,7 +1200,7 @@ static bool readParam(const std::vector<int>& params, int key, int& value)
 bool TiffEncoder::writeLibTiff( const std::vector<Mat>& img_vec, const std::vector<int>& params)
 {
     // do NOT put "wb" as the mode, because the b means "big endian" mode, not "binary" mode.
-    // http://www.remotesensing.org/libtiff/man/TIFFOpen.3tiff.html
+    // http://www.simplesystems.org/libtiff/functions/TIFFOpen.html
     TIFF* tif = NULL;
 
     TiffEncoderBufHelper buf_helper(m_buf);
diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt
index d85b95ed26..1c033c96fd 100644
--- a/modules/imgproc/CMakeLists.txt
+++ b/modules/imgproc/CMakeLists.txt
@@ -14,10 +14,12 @@ ocv_define_module(imgproc opencv_core WRAP java objc python js)
 
 ocv_module_include_directories(opencv_imgproc ${ZLIB_INCLUDE_DIRS})
 
-ocv_check_environment_variables(OPENCV_IPP_GAUSSIAN_BLUR)
-option(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OFF)
-if(OPENCV_IPP_GAUSSIAN_BLUR)
-  ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/smooth.dispatch.cpp "ENABLE_IPP_GAUSSIAN_BLUR=1")
+if(HAVE_IPP)
+  # OPENCV_IPP_ENABLE_ALL is defined in modules/core/CMakeLists.txt
+  OCV_OPTION(OPENCV_IPP_GAUSSIAN_BLUR "Enable IPP optimizations for GaussianBlur (+8Mb in binary size)" OPENCV_IPP_ENABLE_ALL)
+  if(OPENCV_IPP_GAUSSIAN_BLUR)
+    ocv_append_source_file_compile_definitions(${CMAKE_CURRENT_SOURCE_DIR}/src/smooth.dispatch.cpp "ENABLE_IPP_GAUSSIAN_BLUR=1")
+  endif()
 endif()
 
 set(UNIFONT_MD5 "fb79cf5b4f4c89414f1233f14c2eb273")
diff --git a/modules/imgproc/src/distransform.cpp b/modules/imgproc/src/distransform.cpp
index 57940935d4..adb0359c07 100755
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@ -78,7 +78,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
     int dststep = (int)(_dist.step/sizeof(dist[0]));
@@ -87,11 +87,10 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
     initTopBottom( _temp, BORDER );
 
     // forward pass
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    const uchar* s = src;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-
         for( j = 0; j < BORDER; j++ )
             tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
 
@@ -111,13 +110,15 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 tmp[j] = t0;
             }
         }
+        tmp += step;
+        s += srcstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
+        tmp -= step;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -137,6 +138,7 @@ distanceTransform_3x3( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
             t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
@@ -153,7 +155,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
     int dststep = (int)(_dist.step/sizeof(dist[0]));
@@ -162,11 +164,10 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
     initTopBottom( _temp, BORDER );
 
     // forward pass
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    const uchar* s = src;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-
         for( j = 0; j < BORDER; j++ )
             tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
 
@@ -194,13 +195,15 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
                 tmp[j] = t0;
             }
         }
+        tmp += step;
+        s += srcstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
+        tmp -= step;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -228,6 +231,7 @@ distanceTransform_5x5( const Mat& _src, Mat& _temp, Mat& _dist, const float* met
             t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
@@ -245,7 +249,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
 
     const uchar* src = _src.ptr();
     int* temp = _temp.ptr<int>();
-    float* dist = _dist.ptr<float>();
+    float* dist = _dist.ptr<float>(_dist.rows - 1);
     int* labels = _labels.ptr<int>();
     int srcstep = (int)(_src.step/sizeof(src[0]));
     int step = (int)(_temp.step/sizeof(temp[0]));
@@ -256,12 +260,11 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
     initTopBottom( _temp, BORDER );
 
     // forward pass
+    const uchar* s = src;
+    unsigned int* tmp = (unsigned int*)(temp + BORDER*step) + BORDER;
+    int* lls = (int*)labels;
     for( i = 0; i < size.height; i++ )
     {
-        const uchar* s = src + i*srcstep;
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-        int* lls = (int*)(labels + i*lstep);
-
         for( j = 0; j < BORDER; j++ )
             tmp[-j-1] = tmp[size.width + j] = INIT_DIST0;
 
@@ -330,14 +333,17 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
                 lls[j] = l0;
             }
         }
+        s += srcstep;
+        tmp += step;
+        lls += lstep;
     }
 
     // backward pass
+    float* d = (float*)dist;
     for( i = size.height - 1; i >= 0; i-- )
     {
-        float* d = (float*)(dist + i*dststep);
-        unsigned int* tmp = (unsigned int*)(temp + (i+BORDER)*step) + BORDER;
-        int* lls = (int*)(labels + i*lstep);
+        tmp -= step;
+        lls -= lstep;
 
         for( j = size.width - 1; j >= 0; j-- )
         {
@@ -399,6 +405,7 @@ distanceTransformEx_5x5( const Mat& _src, Mat& _temp, Mat& _dist, Mat& _labels,
             t0 = (t0 > DIST_MAX) ? DIST_MAX : t0;
             d[j] = (float)(t0 * scale);
         }
+        d -= dststep;
     }
 }
 
diff --git a/modules/imgproc/src/geometry.cpp b/modules/imgproc/src/geometry.cpp
index 9536514b7d..3a40caecf1 100644
--- a/modules/imgproc/src/geometry.cpp
+++ b/modules/imgproc/src/geometry.cpp
@@ -269,17 +269,16 @@ static LineSegmentIntersection parallelInt( Point2f a, Point2f b, Point2f c, Poi
 static LineSegmentIntersection intersectLineSegments( Point2f a, Point2f b, Point2f c,
                                                       Point2f d, Point2f& p, Point2f& q )
 {
-    double denom = a.x * (double)(d.y - c.y) + b.x * (double)(c.y - d.y) +
-                   d.x * (double)(b.y - a.y) + c.x * (double)(a.y - b.y);
+    double denom = (a.x - b.x) * (double)(d.y - c.y) - (a.y - b.y) * (double)(d.x - c.x);
 
     // If denom is zero, then segments are parallel: handle separately.
     if( denom == 0. )
         return parallelInt(a, b, c, d, p, q);
 
-    double num = a.x * (double)(d.y - c.y) + c.x * (double)(a.y - d.y) + d.x * (double)(c.y - a.y);
+    double num = (d.y - a.y) * (double)(a.x - c.x) + (a.x - d.x) * (double)(a.y - c.y);
     double s = num / denom;
 
-    num = a.x * (double)(b.y - c.y) + b.x * (double)(c.y - a.y) + c.x * (double)(a.y - b.y);
+    num = (b.y - a.y) * (double)(a.x - c.x) + (c.y - a.y) * (double)(b.x - a.x);
     double t = num / denom;
 
     p.x = (float)(a.x + s*(b.x - a.x));
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 2164639127..a1441fd44f 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2679,8 +2679,13 @@ static bool ipp_warpAffine( InputArray _src, OutputArray _dst, int interpolation
     }
 
     return true;
+#else
+    CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(interpolation);
+    CV_UNUSED(borderType); CV_UNUSED(_M); CV_UNUSED(flags);
+    return false;
 #endif
 }
+
 #endif
 
 namespace hal {
diff --git a/modules/imgproc/test/test_distancetransform.cpp b/modules/imgproc/test/test_distancetransform.cpp
index e8b9a8cb06..b9d480e524 100644
--- a/modules/imgproc/test/test_distancetransform.cpp
+++ b/modules/imgproc/test/test_distancetransform.cpp
@@ -104,4 +104,28 @@ TEST(Imgproc_DistanceTransform, large_square_22732)
     EXPECT_EQ(0, nerrs) << "reference distance map is different from computed one at " << nerrs << " pixels\n";
 }
 
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_3x3)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist;  // NOTE(review): presumably sized so row * step exceeds a 32-bit int (issue 23895) - confirm
+    distanceTransform(src.col(0), dist, DIST_L2, DIST_MASK_3);  // only the first column is passed in; 3x3 mask
+    int nz = countNonZero(dist);
+    EXPECT_EQ(nz, 0);  // the all-zero source is expected to give an all-zero distance map
+}
+
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_5x5)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist;  // NOTE(review): presumably sized so row * step exceeds a 32-bit int (issue 23895) - confirm
+    distanceTransform(src.col(0), dist, DIST_L2, DIST_MASK_5);  // only the first column is passed in; 5x5 mask
+    int nz = countNonZero(dist);
+    EXPECT_EQ(nz, 0);  // the all-zero source is expected to give an all-zero distance map
+}
+
+BIGDATA_TEST(Imgproc_DistanceTransform, issue_23895_5x5_labels)
+{
+    Mat src = Mat::zeros(50000, 50000, CV_8U), dist, labels;  // NOTE(review): presumably sized so row * step exceeds a 32-bit int (issue 23895) - confirm
+    distanceTransform(src.col(0), dist, labels, DIST_L2, DIST_MASK_5);  // overload with a labels output; 5x5 mask
+    int nz = countNonZero(dist);  // labels is requested but not validated here
+    EXPECT_EQ(nz, 0);  // the all-zero source is expected to give an all-zero distance map
+}
+
 }} // namespace
diff --git a/modules/js/test/test_core.js b/modules/js/test/test_core.js
new file mode 100644
index 0000000000..14d4ffe72b
--- /dev/null
+++ b/modules/js/test/test_core.js
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+if (typeof module !== 'undefined' && module.exports) {
+    // The environment is Node.js
+    var cv = require('./opencv.js'); // eslint-disable-line no-var
+}
+
+QUnit.module('Core', {});
+
+QUnit.test('test_LUT', function(assert) {
+    // Check cv.LUT with an inverting lookup table (lut[i] = 255 - i) on a 3x3 8-bit matrix.
+    {
+        let src = cv.matFromArray(3, 3, cv.CV_8UC1, [255, 128, 0, 0, 128, 255, 1, 2, 254]);
+        let lutTable = [];
+        for (let i = 0; i < 256; i++)
+        {
+           lutTable[i] = 255 - i;
+        }
+        let lut = cv.matFromArray(1, 256, cv.CV_8UC1, lutTable);
+        let dst = new cv.Mat();
+
+        cv.LUT(src, lut, dst);
+
+        // Every output element must equal 255 minus the corresponding source element.
+        assert.equal(dst.ucharAt(0), 0);
+        assert.equal(dst.ucharAt(1), 127);
+        assert.equal(dst.ucharAt(2), 255);
+        assert.equal(dst.ucharAt(3), 255);
+        assert.equal(dst.ucharAt(4), 127);
+        assert.equal(dst.ucharAt(5), 0);
+        assert.equal(dst.ucharAt(6), 254);
+        assert.equal(dst.ucharAt(7), 253);
+        assert.equal(dst.ucharAt(8), 1);
+        // Explicitly delete the Mats created above.
+        src.delete();
+        lut.delete();
+        dst.delete();
+    }
+});
diff --git a/modules/js/test/test_mat.js b/modules/js/test/test_mat.js
index 409ed1b123..fd3611cd2c 100644
--- a/modules/js/test/test_mat.js
+++ b/modules/js/test/test_mat.js
@@ -73,7 +73,7 @@ if (typeof module !== 'undefined' && module.exports) {
     var cv = require('./opencv.js'); // eslint-disable-line no-var
 }
 
-QUnit.module('Core', {});
+QUnit.module('CoreMat', {});
 
 QUnit.test('test_mat_creation', function(assert) {
     // Mat constructors.
diff --git a/modules/js/test/tests.html b/modules/js/test/tests.html
index de64ca7a29..b20013ec63 100644
--- a/modules/js/test/tests.html
+++ b/modules/js/test/tests.html
@@ -52,12 +52,12 @@
               if (window.cv instanceof Promise) {
                 window.cv.then((target) => {
                    window.cv = target;
-                   //console.log(cv.getBuildInformation());
+                   console.log(cv.getBuildInformation());
                    QUnit.start();
                 })
               } else {
                 // for backward compatible
-                // console.log(cv.getBuildInformation());
+                console.log(cv.getBuildInformation());
                 QUnit.start();
               }
             },
@@ -108,6 +108,7 @@
         <script type="application/javascript" async src="opencv.js" onerror="opencvjs_LoadError()"></script>
         <script type="application/javascript" src="test_mat.js"></script>
         <script type="application/javascript" src="test_utils.js"></script>
+        <script type="application/javascript" src="test_core.js"></script>
         <script type="application/javascript" src="test_imgproc.js"></script>
         <script type="application/javascript" src="test_objdetect.js"></script>
         <script type="application/javascript" src="test_video.js"></script>
diff --git a/modules/js/test/tests.js b/modules/js/test/tests.js
index f3156f6ea0..74a4b87e45 100644
--- a/modules/js/test/tests.js
+++ b/modules/js/test/tests.js
@@ -44,10 +44,15 @@ testrunner.options.maxBlockDuration = 20000; // cause opencv_js.js need time to
 testrunner.run(
     {
         code: 'opencv.js',
-        tests: ['test_mat.js', 'test_utils.js', 'test_imgproc.js',
-                'test_objdetect.js', 'test_video.js', 'test_features2d.js',
+        tests: ['test_mat.js',
+                'test_utils.js',
+                'test_core.js',
+                'test_imgproc.js',
+                'test_objdetect.js',
+                'test_video.js',
+                'test_features2d.js',
                 'test_photo.js',
-                'test_calib3d.js'
+                'test_calib3d.js',
         ],
     },
     function(err, report) {
diff --git a/modules/objdetect/misc/python/test/test_objdetect_aruco.py b/modules/objdetect/misc/python/test/test_objdetect_aruco.py
index d63a19cd2f..dda58b6460 100644
--- a/modules/objdetect/misc/python/test/test_objdetect_aruco.py
+++ b/modules/objdetect/misc/python/test/test_objdetect_aruco.py
@@ -186,6 +186,39 @@ class aruco_objdetect_test(NewOpenCVTests):
         self.assertEqual((1, 4, 2), refine_corners[0].shape)
         np.testing.assert_array_equal(corners, refine_corners)
 
+    def test_charuco_refine(self):
+        aruco_dict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_6X6_50)
+        board_size = (3, 4)
+        board = cv.aruco.CharucoBoard(board_size, 1., .7, aruco_dict)
+        aruco_detector = cv.aruco.ArucoDetector(aruco_dict)
+        charuco_detector = cv.aruco.CharucoDetector(board)
+        cell_size = 100
+        image = board.generateImage((cell_size*board_size[0], cell_size*board_size[1]))
+        camera = np.array([[1, 0, 0.5],
+                           [0, 1, 0.5],
+                           [0, 0, 1]])
+        dist = np.array([0, 0, 0, 0, 0], dtype=np.float32).reshape(1, -1)
+
+        # gold (expected) ArUco marker corners: x/y of the board object points scaled by cell_size
+        gold_corners = np.array(board.getObjPoints())[:, :, 0:2]*cell_size
+
+        # detect ArUco marker corners in the generated board image
+        markerCorners, markerIds, _ = aruco_detector.detectMarkers(image)
+
+        # test refine: pass the last detected marker as "rejected" and check below that all 6 markers are present after refinement
+        rejected = [markerCorners[-1]]
+        markerCorners, markerIds = markerCorners[:-1], markerIds[:-1]
+        markerCorners, markerIds, _, _ = aruco_detector.refineDetectedMarkers(image, board, markerCorners, markerIds,
+                                                                              rejected, cameraMatrix=camera, distCoeffs=dist)
+
+        charucoCorners, charucoIds, _, _ = charuco_detector.detectBoard(image, markerCorners=markerCorners,
+                                                                        markerIds=markerIds)
+        self.assertEqual(len(charucoIds), 6)
+        self.assertEqual(len(markerIds), 6)
+
+        for i, id in enumerate(markerIds.reshape(-1)):
+            np.testing.assert_allclose(gold_corners[id], markerCorners[i].reshape(4, 2), 0.01, 1.)
+
     def test_write_read_dictionary(self):
         try:
             aruco_dict = cv.aruco.getPredefinedDictionary(cv.aruco.DICT_5X5_50)
diff --git a/modules/objdetect/src/aruco/aruco_detector.cpp b/modules/objdetect/src/aruco/aruco_detector.cpp
index 4b3af1b2c7..a62ca10faa 100644
--- a/modules/objdetect/src/aruco/aruco_detector.cpp
+++ b/modules/objdetect/src/aruco/aruco_detector.cpp
@@ -1000,7 +1000,13 @@ static inline void _projectUndetectedMarkers(const Board &board, InputOutputArra
                                              OutputArray undetectedMarkersIds) {
     Mat rvec, tvec; // first estimate board pose with the current avaible markers
     Mat objPoints, imgPoints; // object and image points for the solvePnP function
-    board.matchImagePoints(detectedCorners, detectedIds, objPoints, imgPoints);
+    // To refine the corners of ArUco markers, refineDetectedMarkers() finds the pose of the ArUco markers from 3D-2D point correspondences.
+    // These 3D-2D point correspondences are found with matchImagePoints().
+    // The method matchImagePoints() works with ArUco corners (in Board/GridBoard cases) or with ChArUco corners (in CharucoBoard case).
+    // To refine the corners of ArUco markers we need to work with ArUco corners only, for all board types.
+    // To call matchImagePoints() with ArUco corners for all boards we need to call matchImagePoints() from the base class Board.
+    // The method matchImagePoints() is implemented in the Pimpl, so we need to create a temporary Board object to call the base method.
+    Board(board.getObjPoints(), board.getDictionary(), board.getIds()).matchImagePoints(detectedCorners, detectedIds, objPoints, imgPoints);
     if (objPoints.total() < 4ull) // at least one marker from board so rvec and tvec are valid
         return;
     solvePnP(objPoints, imgPoints, cameraMatrix, distCoeffs, rvec, tvec);
diff --git a/modules/objdetect/src/aruco/aruco_dictionary.cpp b/modules/objdetect/src/aruco/aruco_dictionary.cpp
index f73cea3357..3d5f9b1bfd 100644
--- a/modules/objdetect/src/aruco/aruco_dictionary.cpp
+++ b/modules/objdetect/src/aruco/aruco_dictionary.cpp
@@ -355,6 +355,7 @@ static int _getSelfDistance(const Mat &marker) {
 
 
 Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &baseDictionary, int randomSeed) {
+    CV_Assert(nMarkers > 0);
     RNG rng((uint64)(randomSeed));
 
     Dictionary out = Dictionary(Mat(), markerSize);
@@ -370,7 +371,7 @@ Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &base
     // if baseDictionary is provided, calculate its intermarker distance
     if(baseDictionary.bytesList.rows > 0) {
         CV_Assert(baseDictionary.markerSize == markerSize);
-        out.bytesList = baseDictionary.bytesList.clone();
+        out.bytesList = baseDictionary.bytesList.rowRange(0, min(nMarkers, baseDictionary.bytesList.rows)).clone();
 
         int minDistance = markerSize * markerSize + 1;
         for(int i = 0; i < out.bytesList.rows; i++) {
diff --git a/modules/objdetect/src/qrcode.cpp b/modules/objdetect/src/qrcode.cpp
index 0e32a2dea3..5e424fca9c 100644
--- a/modules/objdetect/src/qrcode.cpp
+++ b/modules/objdetect/src/qrcode.cpp
@@ -68,19 +68,14 @@ static void updatePointsResult(OutputArray points_, const vector<Point2f>& point
 
 static Point2f intersectionLines(Point2f a1, Point2f a2, Point2f b1, Point2f b2)
 {
+    // Try to solve a two lines intersection (a1, a2) and (b1, b2) as a system of equations:
+    // a2 + u * (a1 - a2) = b2 + v * (b1 - b2)
     const float divisor = (a1.x - a2.x) * (b1.y - b2.y) - (a1.y - a2.y) * (b1.x - b2.x);
     const float eps = 0.001f;
     if (abs(divisor) < eps)
         return a2;
-    Point2f result_square_angle(
-                              ((a1.x * a2.y  -  a1.y * a2.x) * (b1.x - b2.x) -
-                               (b1.x * b2.y  -  b1.y * b2.x) * (a1.x - a2.x)) /
-                               divisor,
-                              ((a1.x * a2.y  -  a1.y * a2.x) * (b1.y - b2.y) -
-                               (b1.x * b2.y  -  b1.y * b2.x) * (a1.y - a2.y)) /
-                               divisor
-                              );
-    return result_square_angle;
+    const float u = ((b2.x - a2.x) * (b1.y - b2.y) + (b1.x - b2.x) * (a2.y - b2.y)) / divisor;
+    return a2 + u * (a1 - a2);
 }
 
 //      / | b
@@ -1254,14 +1249,14 @@ bool QRDecode::computeSidesPoints(const vector<Point> &result_integer_hull)
         {
             if (points.front().x > points.back().x)
             {
-                reverse(points.begin(), points.end());
+                std::reverse(points.begin(), points.end());
             }
         }
         else
         {
             if (points.front().y > points.back().y)
             {
-                reverse(points.begin(), points.end());
+                std::reverse(points.begin(), points.end());
             }
         }
         if (points.empty())
@@ -1637,7 +1632,7 @@ bool QRDecode::findPatternsVerticesPoints(vector<vector<Point> > &patterns_verti
             }
             if ((int)min_angle_pnts_indexes.size() == num_vertices) { break; }
         }
-        sort(min_angle_pnts_indexes.begin(), min_angle_pnts_indexes.end());
+        std::sort(min_angle_pnts_indexes.begin(), min_angle_pnts_indexes.end());
 
         vector<Point> contour_vertices_points;
 
@@ -1766,11 +1761,11 @@ bool QRDecode::findTempPatternsAddingPoints(vector<std::pair<int, vector<Point>
             }
             if (abs(p1.x - p2.x) > abs(p1.y - p2.y))
             {
-                sort(points.begin(), points.end(), sortPointsByX());
+                std::sort(points.begin(), points.end(), sortPointsByX());
             }
             else
             {
-                sort(points.begin(), points.end(), sortPointsByY());
+                std::sort(points.begin(), points.end(), sortPointsByY());
             }
 
             temp_patterns_add_points.push_back(std::pair<int, vector<Point> >(idx_curved_side,points));
@@ -1914,11 +1909,11 @@ void QRDecode::completeAndSortSides()
         Point p2 = it->second.back();
         if (abs(p1.x - p2.x) > abs(p1.y - p2.y))
         {
-            sort(it->second.begin(), it->second.end(), sortPointsByX());
+            std::sort(it->second.begin(), it->second.end(), sortPointsByX());
         }
         else
         {
-            sort(it->second.begin(), it->second.end(), sortPointsByY());
+            std::sort(it->second.begin(), it->second.end(), sortPointsByY());
         }
     }
 }
@@ -2080,8 +2075,8 @@ bool QRDecode::divideIntoEvenSegments(vector<vector<Point2f> > &segments_points)
                 Point2f segment_start = segments_points[i][j];
                 Point2f segment_end   = segments_points[i][j + 1];
                 vector<Point2f>::iterator it_start, it_end, it;
-                it_start = find(spline_lines[i].begin(), spline_lines[i].end(), segment_start);
-                it_end   = find(spline_lines[i].begin(), spline_lines[i].end(), segment_end);
+                it_start = std::find(spline_lines[i].begin(), spline_lines[i].end(), segment_start);
+                it_end   = std::find(spline_lines[i].begin(), spline_lines[i].end(), segment_end);
                 float max_dist_to_line = 0.0;
                 for (it = it_start; it != it_end; it++)
                 {
diff --git a/modules/objdetect/test/test_boarddetection.cpp b/modules/objdetect/test/test_boarddetection.cpp
index e47e6c3cb6..0c99e6de61 100644
--- a/modules/objdetect/test/test_boarddetection.cpp
+++ b/modules/objdetect/test/test_boarddetection.cpp
@@ -318,4 +318,12 @@ TEST(CV_ArucoGenerateBoard, regression_1226) {
     });
 }
 
+TEST(CV_ArucoDictionary, extendDictionary) {
+    aruco::Dictionary base_dictionary = aruco::getPredefinedDictionary(aruco::DICT_4X4_250);
+    aruco::Dictionary custom_dictionary = aruco::extendDictionary(150, 4, base_dictionary); // request only 150 markers of size 4 from the base dictionary
+
+    ASSERT_EQ(custom_dictionary.bytesList.rows, 150); // exactly the requested number of markers is returned
+    ASSERT_EQ(cv::norm(custom_dictionary.bytesList, base_dictionary.bytesList.rowRange(0, 150)), 0.); // and they match the first 150 rows of the base dictionary
+}
+
 }} // namespace
diff --git a/modules/objdetect/test/test_cascadeandhog.cpp b/modules/objdetect/test/test_cascadeandhog.cpp
index 4151b899e3..0a68bd9bb3 100644
--- a/modules/objdetect/test/test_cascadeandhog.cpp
+++ b/modules/objdetect/test/test_cascadeandhog.cpp
@@ -355,7 +355,7 @@ int CV_DetectorTest::validate( int detectorIdx, vector<vector<Rect> >& objects )
                     map[minIdx] = 1;
             }
         }
-        noPair += (int)count_if( map.begin(), map.end(), isZero );
+        noPair += (int)std::count_if( map.begin(), map.end(), isZero );
         totalNoPair += noPair;
 
         /*if( noPair > cvRound(valRects.size()*eps.noPair)+1 )
diff --git a/modules/objdetect/test/test_qrcode_encode.cpp b/modules/objdetect/test/test_qrcode_encode.cpp
index 14900c3078..1005793269 100644
--- a/modules/objdetect/test/test_qrcode_encode.cpp
+++ b/modules/objdetect/test/test_qrcode_encode.cpp
@@ -264,7 +264,8 @@ TEST(Objdetect_QRCode_Encode_Decode, regression)
                 int true_capacity = establishCapacity(mode, version, cur_capacity);
 
                 std::string input_info = symbol_set;
-                std::random_shuffle(input_info.begin(),input_info.end());
+                std::mt19937 rand_gen {1};
+                std::shuffle(input_info.begin(), input_info.end(), rand_gen);
                 int count = 0;
                 if((int)input_info.length() > true_capacity)
                 {
@@ -390,15 +391,8 @@ TEST(Objdetect_QRCode_Encode_Decode_Structured_Append, DISABLED_regression)
         std::string symbol_set = config["symbols_set"];
 
         std::string input_info = symbol_set;
-#if defined CV_CXX11
-        // std::random_shuffle is deprecated since C++11 and removed in C++17.
-        // Use manually constructed RNG with a fixed seed and std::shuffle instead.
         std::mt19937 rand_gen {1};
         std::shuffle(input_info.begin(), input_info.end(), rand_gen);
-#else
-        SeededRandFunctor<1> rand_gen;
-        std::random_shuffle(input_info.begin(), input_info.end(), rand_gen);
-#endif
         for (int j = min_stuctures_num; j < max_stuctures_num; j++)
         {
             QRCodeEncoder::Params params;
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py
index 851a23e880..c886342832 100644
--- a/modules/python/test/test_cuda.py
+++ b/modules/python/test/test_cuda.py
@@ -70,6 +70,74 @@ class cuda_test(NewOpenCVTests):
         self.assertTrue(cuMat.step == 0)
         self.assertTrue(cuMat.size() == (0, 0))
 
+    def test_cuda_convertTo(self):
+        # setup
+        npMat_8UC4 = (np.random.random((128, 128, 4)) * 255).astype(np.uint8)
+        npMat_32FC4 = npMat_8UC4.astype(np.single)
+        new_type = cv.CV_32FC4
+
+        # sync
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_32FC4 = cv.cuda_GpuMat(cuMat_8UC4.size(), new_type)
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, cuMat_32FC4)
+        self.assertTrue(cuMat_32FC4.cudaPtr() == cuMat_32FC4_out.cudaPtr())
+        npMat_32FC4_out = cuMat_32FC4.download()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+        # out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type)
+        npMat_32FC4_out = cuMat_32FC4.download()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+
+        # async
+        stream = cv.cuda.Stream()
+        cuMat_32FC4 = cv.cuda_GpuMat(cuMat_8UC4.size(), new_type)
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, cuMat_32FC4)
+        # in/out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, 1, 0, stream, cuMat_32FC4)
+        self.assertTrue(cuMat_32FC4.cudaPtr() == cuMat_32FC4_out.cudaPtr())
+        npMat_32FC4_out = cuMat_32FC4.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+        # out
+        cuMat_32FC4_out = cuMat_8UC4.convertTo(new_type, 1, 0, stream)
+        npMat_32FC4_out = cuMat_32FC4.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_32FC4, npMat_32FC4_out))
+
+    def test_cuda_copyTo(self):
+        # setup
+        npMat_8UC4 = (np.random.random((128, 128, 4)) * 255).astype(np.uint8)
+
+        # sync
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_8UC4_dst = cv.cuda_GpuMat(cuMat_8UC4.size(), cuMat_8UC4.type())
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(cuMat_8UC4_dst)
+        self.assertTrue(cuMat_8UC4_out.cudaPtr() == cuMat_8UC4_dst.cudaPtr())
+        npMat_8UC4_out = cuMat_8UC4_out.download()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+        # out
+        cuMat_8UC4_out =  cuMat_8UC4.copyTo()
+        npMat_8UC4_out = cuMat_8UC4_out.download()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+
+        # async
+        stream = cv.cuda.Stream()
+        # in/out
+        cuMat_8UC4 = cv.cuda_GpuMat(npMat_8UC4)
+        cuMat_8UC4_dst = cv.cuda_GpuMat(cuMat_8UC4.size(), cuMat_8UC4.type())
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(cuMat_8UC4_dst, stream)
+        self.assertTrue(cuMat_8UC4_out.cudaPtr() == cuMat_8UC4_out.cudaPtr())
+        npMat_8UC4_out = cuMat_8UC4_dst.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+        # out
+        cuMat_8UC4_out = cuMat_8UC4.copyTo(stream)
+        npMat_8UC4_out = cuMat_8UC4_out.download(stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.array_equal(npMat_8UC4, npMat_8UC4_out))
+
     def test_cuda_denoising(self):
         self.assertEqual(True, hasattr(cv.cuda, 'fastNlMeansDenoising'))
         self.assertEqual(True, hasattr(cv.cuda, 'fastNlMeansDenoisingColored'))
diff --git a/modules/stereo/test/test_stereomatching.cpp b/modules/stereo/test/test_stereomatching.cpp
index 02d1823d2d..c17d92292a 100644
--- a/modules/stereo/test/test_stereomatching.cpp
+++ b/modules/stereo/test/test_stereomatching.cpp
@@ -740,8 +740,8 @@ public:
     CV_StereoBMTest()
     {
         name = "stereobm";
-        fill(rmsEps.begin(), rmsEps.end(), 0.4f);
-        fill(fracEps.begin(), fracEps.end(), 0.022f);
+        std::fill(rmsEps.begin(), rmsEps.end(), 0.4f);
+        std::fill(fracEps.begin(), fracEps.end(), 0.022f);
     }
 
 protected:
@@ -866,8 +866,8 @@ public:
     CV_StereoSGBMTest()
     {
         name = "stereosgbm";
-        fill(rmsEps.begin(), rmsEps.end(), 0.25f);
-        fill(fracEps.begin(), fracEps.end(), 0.01f);
+        std::fill(rmsEps.begin(), rmsEps.end(), 0.25f);
+        std::fill(fracEps.begin(), fracEps.end(), 0.01f);
     }
 
 protected:
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 442fa08ec5..7bc3934891 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -134,7 +134,7 @@ enum VideoCaptureAPIs {
 */
 enum VideoCaptureProperties {
        CAP_PROP_POS_MSEC       =0, //!< Current position of the video file in milliseconds.
-       CAP_PROP_POS_FRAMES     =1, //!< 0-based index of the frame to be decoded/captured next.
+       CAP_PROP_POS_FRAMES     =1, //!< 0-based index of the frame to be decoded/captured next. When the index i is set in RAW mode (CAP_PROP_FORMAT == -1) this will seek to the key frame k, where k <= i.
        CAP_PROP_POS_AVI_RATIO  =2, //!< Relative position of the video file: 0=start of the film, 1=end of the film.
        CAP_PROP_FRAME_WIDTH    =3, //!< Width of the frames in the video stream.
        CAP_PROP_FRAME_HEIGHT   =4, //!< Height of the frames in the video stream.
@@ -1030,6 +1030,9 @@ public:
     - Most codecs are lossy. If you want lossless video file you need to use a lossless codecs
       (eg. FFMPEG FFV1, Huffman HFYU, Lagarith LAGS, etc...)
     - If FFMPEG is enabled, using `codec=0; fps=0;` you can create an uncompressed (raw) video file.
+    - If FFMPEG is used, we allow frames of odd width or height, but in this case we truncate
+      the rightmost column/the bottom row. Probably, this should be handled more elegantly,
+      but some internal functions inside FFMPEG swscale require even width/height.
     */
     CV_WRAP VideoWriter(const String& filename, int fourcc, double fps,
                 Size frameSize, bool isColor = true);
diff --git a/modules/videoio/src/cap_dshow.cpp b/modules/videoio/src/cap_dshow.cpp
index d6b2b95545..21af06a147 100644
--- a/modules/videoio/src/cap_dshow.cpp
+++ b/modules/videoio/src/cap_dshow.cpp
@@ -2771,7 +2771,7 @@ int videoInput::start(int deviceID, videoDevice *VD){
     if(customSize){
         DebugPrintOut("SETUP: Default Format is set to %ix%i\n", currentWidth, currentHeight);
 
-        if (strcmp("OBS Virtual Camera", VD->nDeviceName) == 0)
+        if (strcmp("OBS Virtual Camera", VD->nDeviceName) == 0 || strcmp("Streamlabs Desktop Virtual Webcam", VD->nDeviceName) == 0)
         {
             // OBS Virtual Camera always returns S_OK on SetFormat(), even if it doesn't support
             // the actual format. So we have to choose a format that it supports manually, e.g. NV12.
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 982bc5c87d..e4431b323e 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -580,6 +580,7 @@ struct CvCapture_FFMPEG
     bool processRawPacket();
     bool rawMode;
     bool rawModeInitialized;
+    bool rawSeek;
     bool convertRGB;
     AVPacket packet_filtered;
 #if LIBAVFORMAT_BUILD >= CALC_FFMPEG_VERSION(58, 20, 100)
@@ -633,6 +634,7 @@ void CvCapture_FFMPEG::init()
 
     rawMode = false;
     rawModeInitialized = false;
+    rawSeek = false;
     convertRGB = true;
     memset(&packet_filtered, 0, sizeof(packet_filtered));
     av_init_packet(&packet_filtered);
@@ -1051,33 +1053,35 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
                 return false;
             }
         }
-        if (params.has(CAP_PROP_HW_ACCELERATION))
-        {
-            va_type = params.get<VideoAccelerationType>(CAP_PROP_HW_ACCELERATION);
+        if(!rawMode) {
+            if (params.has(CAP_PROP_HW_ACCELERATION))
+            {
+                va_type = params.get<VideoAccelerationType>(CAP_PROP_HW_ACCELERATION);
 #if !USE_AV_HW_CODECS
-            if (va_type != VIDEO_ACCELERATION_NONE && va_type != VIDEO_ACCELERATION_ANY)
-            {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: FFmpeg backend is build without acceleration support. Can't handle CAP_PROP_HW_ACCELERATION parameter. Bailout");
-                return false;
-            }
+                if (va_type != VIDEO_ACCELERATION_NONE && va_type != VIDEO_ACCELERATION_ANY)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: FFmpeg backend is build without acceleration support. Can't handle CAP_PROP_HW_ACCELERATION parameter. Bailout");
+                    return false;
+                }
 #endif
-        }
-        if (params.has(CAP_PROP_HW_DEVICE))
-        {
-            hw_device = params.get<int>(CAP_PROP_HW_DEVICE);
-            if (va_type == VIDEO_ACCELERATION_NONE && hw_device != -1)
-            {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE without requested H/W acceleration. Bailout");
-                return false;
             }
-            if (va_type == VIDEO_ACCELERATION_ANY && hw_device != -1)
+            if (params.has(CAP_PROP_HW_DEVICE))
             {
-                CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE with 'ANY' H/W acceleration. Bailout");
-                return false;
+                hw_device = params.get<int>(CAP_PROP_HW_DEVICE);
+                if (va_type == VIDEO_ACCELERATION_NONE && hw_device != -1)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE without requested H/W acceleration. Bailout");
+                    return false;
+                }
+                if (va_type == VIDEO_ACCELERATION_ANY && hw_device != -1)
+                {
+                    CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: Invalid usage of CAP_PROP_HW_DEVICE with 'ANY' H/W acceleration. Bailout");
+                    return false;
+                }
+            }
+            if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
+                use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
             }
-        }
-        if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
-            use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
         }
 #if USE_AV_INTERRUPT_CALLBACK
         if (params.has(CAP_PROP_OPEN_TIMEOUT_MSEC))
@@ -1153,6 +1157,23 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
         CV_LOG_WARNING(NULL, "Unable to read codec parameters from stream (" << _opencv_ffmpeg_get_error_string(err) << ")");
         goto exit_func;
     }
+
+    if (rawMode) {
+        video_stream = av_find_best_stream(ic, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+        if (video_stream < 0) {
+            close();
+            return false;
+        }
+        video_st = ic->streams[video_stream];
+#ifndef CV_FFMPEG_CODECPAR
+        frame.height = video_st->codec->height;
+        frame.width = video_st->codec->width;
+#else
+        frame.height = video_st->codecpar->height;
+        frame.width = video_st->codecpar->width;
+#endif
+        return true;
+    }
     for(i = 0; i < ic->nb_streams; i++)
     {
 #ifndef CV_FFMPEG_CODECPAR
@@ -1440,6 +1461,10 @@ bool CvCapture_FFMPEG::processRawPacket()
 
 bool CvCapture_FFMPEG::grabFrame()
 {
+    if (rawSeek) {
+        rawSeek = false;
+        return true;
+    }
     bool valid = false;
 
     static const size_t max_read_attempts = cv::utils::getConfigurationParameterSizeT("OPENCV_FFMPEG_READ_ATTEMPTS", 4096);
@@ -1447,7 +1472,7 @@ bool CvCapture_FFMPEG::grabFrame()
     size_t cur_read_attempts = 0;
     size_t cur_decode_attempts = 0;
 
-    if( !ic || !video_st || !context )  return false;
+    if( !ic || !video_st || (!rawMode && !context) )  return false;
 
     if( ic->streams[video_stream]->nb_frames > 0 &&
         frame_number > ic->streams[video_stream]->nb_frames )
@@ -1464,7 +1489,7 @@ bool CvCapture_FFMPEG::grabFrame()
 
 #if USE_AV_SEND_FRAME_API
     // check if we can receive frame from previously decoded packet
-    valid = avcodec_receive_frame(context, picture) >= 0;
+    valid = rawMode ? false : avcodec_receive_frame(context, picture) >= 0;
 #endif
 
     // get the next frame
@@ -1548,12 +1573,16 @@ bool CvCapture_FFMPEG::grabFrame()
     }
 
     if (valid) {
-        if( picture_pts == AV_NOPTS_VALUE_ )
-            picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts;
-        frame_number++;
+        if (picture_pts == AV_NOPTS_VALUE_) {
+            if (!rawMode)
+                picture_pts = picture->CV_FFMPEG_PTS_FIELD != AV_NOPTS_VALUE_ && picture->CV_FFMPEG_PTS_FIELD != 0 ? picture->CV_FFMPEG_PTS_FIELD : picture->pkt_dts;
+            else
+                picture_pts = packet.pts != AV_NOPTS_VALUE_ && packet.pts != 0 ? packet.pts : packet.dts;
+            frame_number++;
+        }
     }
 
-    if (!rawMode && valid && first_frame_number < 0)
+    if (valid && first_frame_number < 0)
         first_frame_number = dts_to_frame_number(picture_pts);
 
 #if USE_AV_INTERRUPT_CALLBACK
@@ -1567,7 +1596,7 @@ bool CvCapture_FFMPEG::grabFrame()
 
 bool CvCapture_FFMPEG::retrieveFrame(int flag, unsigned char** data, int* step, int* width, int* height, int* cn, int* depth)
 {
-    if (!video_st || !context)
+    if (!video_st || (!rawMode && !context))
         return false;
 
     if (rawMode || flag == extraDataIdx)
@@ -1735,7 +1764,7 @@ static inline double getCodecIdFourcc(const AVCodecID codec_id)
 
 double CvCapture_FFMPEG::getProperty( int property_id ) const
 {
-    if( !video_st || !context ) return 0;
+    if( !video_st || (!rawMode && !context) ) return 0;
 
     switch( property_id )
     {
@@ -1814,7 +1843,8 @@ double CvCapture_FFMPEG::getProperty( int property_id ) const
         //ic->start_time_realtime is in microseconds
         return ((double)ic->start_time_realtime);
     case CAP_PROP_N_THREADS:
-        return static_cast<double>(context->thread_count);
+        if (!rawMode)
+            return static_cast<double>(context->thread_count);
     default:
         break;
     }
@@ -1910,9 +1940,11 @@ void CvCapture_FFMPEG::get_rotation_angle()
 
 void CvCapture_FFMPEG::seek(int64_t _frame_number)
 {
-    CV_Assert(context);
+    if (!rawMode) {
+        CV_Assert(context);
+    }
     _frame_number = std::min(_frame_number, get_total_frames());
-    int delta = 16;
+    int delta = !rawMode ? 16 : 0;
 
     // if we have not grabbed a single frame before first seek, let's read the first frame
     // and get some valuable information during the process
@@ -1927,7 +1959,8 @@ void CvCapture_FFMPEG::seek(int64_t _frame_number)
         double  time_base  = r2d(ic->streams[video_stream]->time_base);
         time_stamp += (int64_t)(sec / time_base + 0.5);
         if (get_total_frames() > 1) av_seek_frame(ic, video_stream, time_stamp, AVSEEK_FLAG_BACKWARD);
-        avcodec_flush_buffers(context);
+        if(!rawMode)
+            avcodec_flush_buffers(context);
         if( _frame_number > 0 )
         {
             grabFrame();
@@ -1935,6 +1968,10 @@ void CvCapture_FFMPEG::seek(int64_t _frame_number)
             if( _frame_number > 1 )
             {
                 frame_number = dts_to_frame_number(picture_pts) - first_frame_number;
+                if (rawMode) {
+                    rawSeek = true;
+                    break;
+                }
                 //printf("_frame_number = %d, frame_number = %d, delta = %d\n",
                 //       (int)_frame_number, (int)frame_number, delta);
 
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index fc031d2b5f..305d527ce9 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -2825,8 +2825,6 @@ CvResult CV_API_CALL cv_capture_open_with_params(
     if (!handle)
         return CV_ERROR_FAIL;
     *handle = NULL;
-    if (!filename)
-        return CV_ERROR_FAIL;
     GStreamerCapture *cap = 0;
     try
     {
diff --git a/modules/videoio/src/cap_mjpeg_encoder.cpp b/modules/videoio/src/cap_mjpeg_encoder.cpp
index efac4093ae..2e7452cf17 100644
--- a/modules/videoio/src/cap_mjpeg_encoder.cpp
+++ b/modules/videoio/src/cap_mjpeg_encoder.cpp
@@ -268,7 +268,7 @@ public:
             m_buffer_list[0].finish();
 
             m_data_len = m_buffer_list[0].get_len();
-            m_last_bit_len = m_buffer_list[0].get_bits_free() ? 32 - m_buffer_list[0].get_bits_free() : 0;
+            m_last_bit_len = 32 - m_buffer_list[0].get_bits_free();
 
             return m_buffer_list[0].get_data();
         }
@@ -331,9 +331,14 @@ public:
         }
 
         //bits == 0 means that last element shouldn't be used.
-        m_output_buffer[m_data_len++] = currval;
-
-        m_last_bit_len = -bits;
+        if (bits != 0) {
+            m_output_buffer[m_data_len++] = currval;
+            m_last_bit_len = -bits;
+        }
+        else
+        {
+            m_last_bit_len = 32;
+        }
 
         return &m_output_buffer[0];
     }
@@ -1167,8 +1172,6 @@ public:
         fdct_qtab(_fdct_qtab),
         cat_table(_cat_table)
     {
-#if 0  // disable parallel processing due to buffer overrun bug: https://github.com/opencv/opencv/issues/19634
-
         //empirically found value. if number of pixels is less than that value there is no sense to parallelize it.
         const int min_pixels_count = 96*96;
 
@@ -1194,12 +1197,6 @@ public:
 
         stripes_count = std::min(stripes_count, max_stripes);
 
-#else
-        if (nstripes > 1)
-            CV_LOG_ONCE_WARNING(NULL, "VIDEOIO/MJPEG: parallel processing is disabled: https://github.com/opencv/opencv/issues/19634");
-        stripes_count = 1;
-#endif
-
         m_buffer_list.allocate_buffers(stripes_count, (height*width*2)/stripes_count);
     }
 
diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp
index 78eefc34a3..4b234b8cae 100644
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@@ -1159,7 +1159,12 @@ bool CvCapture_MSMF::configureVideoOutput(MediaType newType, cv::uint32_t outFor
     {
         initStream(dwVideoStreamIndex, nativeFormat);
     }
-    return initStream(dwVideoStreamIndex, newFormat);
+    if (!initStream(dwVideoStreamIndex, newFormat))
+    {
+        return false;
+    }
+    outputVideoFormat = outFormat;
+    return true;
 }
 
 bool CvCapture_MSMF::configureOutput()
@@ -2719,8 +2724,6 @@ CvResult CV_API_CALL cv_capture_open_with_params(
     if (!handle)
         return CV_ERROR_FAIL;
     *handle = NULL;
-    if (!filename)
-        return CV_ERROR_FAIL;
     CaptureT* cap = 0;
     try
     {
diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp
index 905c79e42f..5b282f1966 100644
--- a/modules/videoio/src/cap_v4l.cpp
+++ b/modules/videoio/src/cap_v4l.cpp
@@ -2155,6 +2155,7 @@ bool CvCaptureCAM_V4L::setProperty( int property_id, double _value )
         }else{
             convert_rgb = false;
             releaseFrame();
+            v4l2_create_frame();
             return true;
         }
     case cv::CAP_PROP_FOURCC:
diff --git a/modules/videoio/test/test_camera.cpp b/modules/videoio/test/test_camera.cpp
index fc269959c3..8b0f0efe83 100644
--- a/modules/videoio/test/test_camera.cpp
+++ b/modules/videoio/test/test_camera.cpp
@@ -119,6 +119,21 @@ TEST(DISABLED_videoio_camera, v4l_read_mjpg)
     capture.release();
 }
 
+TEST(DISABLED_videoio_camera, msmf_read_yuyv)  // DISABLED_ prefix: GoogleTest skips this unless disabled tests are explicitly requested
+{
+    VideoCapture capture(CAP_MSMF);
+    ASSERT_TRUE(capture.isOpened());
+    ASSERT_TRUE(capture.set(CAP_PROP_FOURCC, VideoWriter::fourcc('Y', 'U', 'Y', 'V')));  // request YUYV before any frame is read
+    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+    int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+    std::cout << "FOURCC code: " << cv::format("0x%08x", fourcc) << std::endl;  // zero-pad: "%8x" would leave spaces after the "0x" prefix
+    test_readFrames(capture);
+    capture.release();
+}
+
 TEST(DISABLED_videoio_camera, v4l_open_mjpg)
 {
     VideoCapture capture;
diff --git a/modules/videoio/test/test_ffmpeg.cpp b/modules/videoio/test/test_ffmpeg.cpp
index 35d425d5c1..0496b8c369 100644
--- a/modules/videoio/test/test_ffmpeg.cpp
+++ b/modules/videoio/test/test_ffmpeg.cpp
@@ -476,6 +476,16 @@ static void ffmpeg_check_read_raw(VideoCapture& cap)
     EXPECT_EQ(CV_8UC1, data.type()) << "CV_8UC1 != " << typeToString(data.type());
     EXPECT_TRUE(data.rows == 1 || data.cols == 1) << data.size;
     EXPECT_EQ((size_t)37118, data.total());
+
+#ifndef WIN32
+    // 12 is the nearest key frame to frame 18
+    EXPECT_TRUE(cap.set(CAP_PROP_POS_FRAMES, 18.));
+    EXPECT_EQ(cap.get(CAP_PROP_POS_FRAMES), 12.);
+    cap >> data;
+    EXPECT_EQ(CV_8UC1, data.type()) << "CV_8UC1 != " << typeToString(data.type());
+    EXPECT_TRUE(data.rows == 1 || data.cols == 1) << data.size;
+    EXPECT_EQ((size_t)8726, data.total());
+#endif
 }
 
 TEST(videoio_ffmpeg, ffmpeg_check_extra_data)
@@ -506,6 +516,16 @@ TEST(videoio_ffmpeg, open_with_property)
         CAP_PROP_FORMAT, -1  // demux only
     }));
 
+    // confirm properties are returned without initializing AVCodecContext
+    EXPECT_EQ(cap.get(CAP_PROP_FORMAT), -1);
+    EXPECT_EQ(static_cast<int>(cap.get(CAP_PROP_FOURCC)), fourccFromString("FMP4"));
+#ifndef WIN32
+    EXPECT_EQ(cap.get(CAP_PROP_N_THREADS), 0.0);
+#endif
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_HEIGHT), 384.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_WIDTH), 672.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_COUNT), 125);
+    EXPECT_EQ(cap.get(CAP_PROP_FPS), 24.0);
     ffmpeg_check_read_raw(cap);
 }
 
@@ -519,6 +539,16 @@ TEST(videoio_ffmpeg, create_with_property)
         CAP_PROP_FORMAT, -1  // demux only
     });
 
+    // confirm properties are returned without initializing AVCodecContext
+    EXPECT_TRUE(cap.get(CAP_PROP_FORMAT) == -1);
+    EXPECT_EQ(static_cast<int>(cap.get(CAP_PROP_FOURCC)), fourccFromString("FMP4"));
+#ifndef WIN32
+    EXPECT_EQ(cap.get(CAP_PROP_N_THREADS), 0.0);
+#endif
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_HEIGHT), 384.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_WIDTH), 672.0);
+    EXPECT_EQ(cap.get(CAP_PROP_FRAME_COUNT), 125);
+    EXPECT_EQ(cap.get(CAP_PROP_FPS), 24.0);
     ffmpeg_check_read_raw(cap);
 }
 
diff --git a/platforms/js/opencv_js.config.py b/platforms/js/opencv_js.config.py
index 66add88456..12f6254801 100644
--- a/platforms/js/opencv_js.config.py
+++ b/platforms/js/opencv_js.config.py
@@ -9,6 +9,7 @@ core = {
         'perspectiveTransform', 'polarToCart', 'pow', 'randn', 'randu', 'reduce', 'repeat', 'rotate', 'setIdentity', 'setRNGSeed',
         'solve', 'solvePoly', 'split', 'sqrt', 'subtract', 'trace', 'transform', 'transpose', 'vconcat',
         'setLogLevel', 'getLogLevel',
+        'LUT',
     ],
     'Algorithm': [],
 }
diff --git a/samples/dnn/fast_neural_style.py b/samples/dnn/fast_neural_style.py
index 912c2f0832..43b8b121d6 100644
--- a/samples/dnn/fast_neural_style.py
+++ b/samples/dnn/fast_neural_style.py
@@ -5,15 +5,15 @@ import argparse
 
 parser = argparse.ArgumentParser(
         description='This script is used to run style transfer models from '
-                    'https://github.com/jcjohnson/fast-neural-style using OpenCV')
+                    'https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style using OpenCV')
 parser.add_argument('--input', help='Path to image or video. Skip to capture frames from camera')
-parser.add_argument('--model', help='Path to .t7 model')
+parser.add_argument('--model', help='Path to .onnx model')
 parser.add_argument('--width', default=-1, type=int, help='Resize input to specific width.')
 parser.add_argument('--height', default=-1, type=int, help='Resize input to specific height.')
 parser.add_argument('--median_filter', default=0, type=int, help='Kernel size of postprocessing blurring.')
 args = parser.parse_args()
 
-net = cv.dnn.readNetFromTorch(cv.samples.findFile(args.model))
+net = cv.dnn.readNet(cv.samples.findFile(args.model))
 net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
 
 if args.input:
@@ -31,16 +31,12 @@ while cv.waitKey(1) < 0:
     inWidth = args.width if args.width != -1 else frame.shape[1]
     inHeight = args.height if args.height != -1 else frame.shape[0]
     inp = cv.dnn.blobFromImage(frame, 1.0, (inWidth, inHeight),
-                              (103.939, 116.779, 123.68), swapRB=False, crop=False)
+                               swapRB=True, crop=False)
 
     net.setInput(inp)
     out = net.forward()
 
     out = out.reshape(3, out.shape[2], out.shape[3])
-    out[0] += 103.939
-    out[1] += 116.779
-    out[2] += 123.68
-    out /= 255
     out = out.transpose(1, 2, 0)
 
     t, _ = net.getPerfProfile()
@@ -50,4 +46,7 @@ while cv.waitKey(1) < 0:
     if args.median_filter:
         out = cv.medianBlur(out, args.median_filter)
 
+    out = np.clip(out, 0, 255)
+    out = out.astype(np.uint8)
+
     cv.imshow('Styled image', out)
diff --git a/samples/dnn/js_face_recognition.html b/samples/dnn/js_face_recognition.html
index e5a3669a4f..95254ecd3a 100644
--- a/samples/dnn/js_face_recognition.html
+++ b/samples/dnn/js_face_recognition.html
@@ -40,7 +40,7 @@ function detectFaces(img) {
 
 //! [Get 128 floating points feature vector]
 function face2vec(face) {
-  var blob = cv.blobFromImage(face, 1.0 / 255, {width: 96, height: 96}, [0, 0, 0, 0], true, false)
+  var blob = cv.blobFromImage(face, 1.0, {width: 112, height: 112}, [0, 0, 0, 0], true, false)
   netRecogn.setInput(blob);
   var vec = netRecogn.forward();
   blob.delete();
@@ -71,15 +71,15 @@ function loadModels(callback) {
   var utils = new Utils('');
   var proto = 'https://raw.githubusercontent.com/opencv/opencv/5.x/samples/dnn/face_detector/deploy_lowres.prototxt';
   var weights = 'https://raw.githubusercontent.com/opencv/opencv_3rdparty/dnn_samples_face_detector_20180205_fp16/res10_300x300_ssd_iter_140000_fp16.caffemodel';
-  var recognModel = 'https://raw.githubusercontent.com/pyannote/pyannote-data/master/openface.nn4.small2.v1.t7';
+  var recognModel =  'https://media.githubusercontent.com/media/opencv/opencv_zoo/main/models/face_recognition_sface/face_recognition_sface_2021dec.onnx';
   utils.createFileFromUrl('face_detector.prototxt', proto, () => {
     document.getElementById('status').innerHTML = 'Downloading face_detector.caffemodel';
     utils.createFileFromUrl('face_detector.caffemodel', weights, () => {
       document.getElementById('status').innerHTML = 'Downloading OpenFace model';
-      utils.createFileFromUrl('face_recognition.t7', recognModel, () => {
+      utils.createFileFromUrl('face_recognition_sface_2021dec.onnx', recognModel, () => {
         document.getElementById('status').innerHTML = '';
         netDet = cv.readNetFromCaffe('face_detector.prototxt', 'face_detector.caffemodel');
-        netRecogn = cv.readNetFromTorch('face_recognition.t7');
+        netRecogn = cv.readNet('face_recognition_sface_2021dec.onnx');
         callback();
       });
     });
@@ -121,8 +121,8 @@ function main() {
       persons[name] = face2vec(face).clone();
 
       var canvas = document.createElement("canvas");
-      canvas.setAttribute("width", 96);
-      canvas.setAttribute("height", 96);
+      canvas.setAttribute("width", 112);
+      canvas.setAttribute("height", 112);
       var cell = document.getElementById("targetImgs").insertCell(0);
       cell.appendChild(canvas);
 
diff --git a/samples/python/tst_scene_render.py b/samples/python/tst_scene_render.py
index 9d09ea7b9e..c3eb69ef9c 100644
--- a/samples/python/tst_scene_render.py
+++ b/samples/python/tst_scene_render.py
@@ -25,7 +25,7 @@ class TestSceneRender():
         if bgImg is not None:
             self.sceneBg = bgImg.copy()
         else:
-            self.sceneBg = np.zeros(defaultSize, defaultSize, np.uint8)
+            self.sceneBg = np.zeros((defaultSize, defaultSize,3), np.uint8)
 
         self.w = self.sceneBg.shape[0]
         self.h = self.sceneBg.shape[1]
@@ -85,7 +85,7 @@ class TestSceneRender():
             img[self.currentCenter[0]:self.currentCenter[0]+self.foreground.shape[0],
              self.currentCenter[1]:self.currentCenter[1]+self.foreground.shape[1]] = self.foreground
         else:
-            self.currentRect = self.initialRect + np.int( 30*cos(self.time*self.speed) + 50*sin(self.time*self.speed))
+            self.currentRect = self.initialRect + int( 30*cos(self.time*self.speed) + 50*sin(self.time*self.speed))
             if self.deformation:
                 self.currentRect[1:3] += int(self.h/20*cos(self.time))
             cv.fillConvexPoly(img, self.currentRect, (0, 0, 255))