diff --git a/3rdparty/fastcv/fastcv.cmake b/3rdparty/fastcv/fastcv.cmake index 6fee4ce4ce..8a82995bf3 100644 --- a/3rdparty/fastcv/fastcv.cmake +++ b/3rdparty/fastcv/fastcv.cmake @@ -1,23 +1,23 @@ function(download_fastcv root_dir) # Commit SHA in the opencv_3rdparty repo - set(FASTCV_COMMIT "8d86e68dad8b80b8575a8d3cf401d3ee96c24148") + set(FASTCV_COMMIT "abe340d0fb7f19fa9315080e3c8616642e98a296") # Define actual FastCV versions if(ANDROID) if(AARCH64) message(STATUS "Download FastCV for Android aarch64") - set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2025_04_08.tgz") - set(FCV_PACKAGE_HASH "e028966a1d1b2f3f0bc5967d316e8b64") + set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2025_04_29.tgz") + set(FCV_PACKAGE_HASH "d9172a9a3e5d92d080a4192cc5691001") else() message(STATUS "Download FastCV for Android armv7") - set(FCV_PACKAGE_NAME "fastcv_android_arm32_2025_04_08.tgz") - set(FCV_PACKAGE_HASH "6fc1e812a4b3ef392469d2283e037ffe") + set(FCV_PACKAGE_NAME "fastcv_android_arm32_2025_04_29.tgz") + set(FCV_PACKAGE_HASH "246b5253233391cd2c74d01d49aee9c3") endif() elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS) if(AARCH64) - set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2025_04_08.tgz") - set(FCV_PACKAGE_HASH "062a26639cd2788beee2e0dd8743d680") + set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2025_04_29.tgz") + set(FCV_PACKAGE_HASH "e2ce60e25c8e4113a7af2bd243118f4c") else() message("FastCV: fastcv lib for 32-bit Linux is not supported for now!") endif() diff --git a/3rdparty/libtiff/tif_hash_set.c b/3rdparty/libtiff/tif_hash_set.c index 9792c63f47..81dea3fcf2 100644 --- a/3rdparty/libtiff/tif_hash_set.c +++ b/3rdparty/libtiff/tif_hash_set.c @@ -146,7 +146,7 @@ TIFFHashSet *TIFFHashSetNew(TIFFHashSetHashFunc fnHashFunc, set->fnEqualFunc = fnEqualFunc ? fnEqualFunc : TIFFHashSetEqualPointer; set->fnFreeEltFunc = fnFreeEltFunc; set->nSize = 0; - set->tabList = (TIFFList **)(calloc(sizeof(TIFFList *), 53)); + set->tabList = (TIFFList **)(calloc(53, sizeof(TIFFList *))); if (set->tabList == NULL) { free(set); @@ -367,7 +367,7 @@ static bool TIFFHashSetRehash(TIFFHashSet *set) { int nNewAllocatedSize = anPrimes[set->nIndiceAllocatedSize]; TIFFList **newTabList = - (TIFFList **)(calloc(sizeof(TIFFList *), nNewAllocatedSize)); + (TIFFList **)(calloc(nNewAllocatedSize, sizeof(TIFFList *))); if (newTabList == NULL) return false; #ifdef HASH_DEBUG diff --git a/3rdparty/openjpeg/openjp2/jp2.c b/3rdparty/openjpeg/openjp2/jp2.c index 4df055a542..da5063186c 100644 --- a/3rdparty/openjpeg/openjp2/jp2.c +++ b/3rdparty/openjpeg/openjp2/jp2.c @@ -2873,7 +2873,7 @@ OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream, p_image, p_manager); - if (p_image && *p_image) { + if (ret && p_image && *p_image) { /* Set Image Color Space */ if (jp2->enumcs == 16) { (*p_image)->color_space = OPJ_CLRSPC_SRGB; diff --git a/CMakeLists.txt b/CMakeLists.txt index 25b1322c3e..1b4da9ea90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -920,9 +920,9 @@ if(WITH_NDSRVP) endif() if(WITH_HAL_RVV) - ocv_debug_message(STATUS "Enable HAL RVV acceleration") - if(NOT ";${OpenCV_HAL};" MATCHES ";halrvv;") - set(OpenCV_HAL "halrvv;${OpenCV_HAL}") + ocv_debug_message(STATUS "Enable RVV HAL acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";rvvhal;") + set(OpenCV_HAL "rvvhal;${OpenCV_HAL}") endif() endif() @@ -955,13 +955,13 @@ foreach(hal ${OpenCV_HAL}) else() message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...") endif() - elseif(hal STREQUAL "halrvv") + elseif(hal STREQUAL "rvvhal") 
if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;") add_subdirectory(hal/riscv-rvv) ocv_hal_register(RVV_HAL_LIBRARIES RVV_HAL_HEADERS RVV_HAL_INCLUDE_DIRS) - list(APPEND OpenCV_USED_HAL "HAL RVV (ver ${RVV_HAL_VERSION})") + list(APPEND OpenCV_USED_HAL "RVV HAL (ver ${RVV_HAL_VERSION})") else() - message(STATUS "HAL RVV: RVV is not available, disabling halrvv...") + message(STATUS "RVV HAL: RVV is not available, disabling RVV HAL...") endif() elseif(hal STREQUAL "ipp") add_subdirectory(hal/ipp) diff --git a/cmake/OpenCVBindingsPreprocessorDefinitions.cmake b/cmake/OpenCVBindingsPreprocessorDefinitions.cmake new file mode 100644 index 0000000000..2828e638a7 --- /dev/null +++ b/cmake/OpenCVBindingsPreprocessorDefinitions.cmake @@ -0,0 +1,63 @@ +function(ocv_bindings_generator_populate_preprocessor_definitions + opencv_modules + output_variable) + set(defs "\"CV_VERSION_MAJOR\": ${OPENCV_VERSION_MAJOR}") + + macro(ocv_add_definition name value) + set(defs "${defs},\n\"${name}\": ${value}") + endmacro() + + ocv_add_definition(CV_VERSION_MINOR ${OPENCV_VERSION_MINOR}) + ocv_add_definition(CV_VERSION_PATCH ${OPENCV_VERSION_PATCH}) + ocv_add_definition(OPENCV_ABI_COMPATIBILITY "${OPENCV_VERSION_MAJOR}00") + + foreach(module IN LISTS ${opencv_modules}) + if(HAVE_${module}) + string(TOUPPER "${module}" module) + ocv_add_definition("HAVE_${module}" 1) + endif() + endforeach() + if(HAVE_EIGEN) + ocv_add_definition(HAVE_EIGEN 1) + ocv_add_definition(EIGEN_WORLD_VERSION ${EIGEN_WORLD_VERSION}) + ocv_add_definition(EIGEN_MAJOR_VERSION ${EIGEN_MAJOR_VERSION}) + ocv_add_definition(EIGEN_MINOR_VERSION ${EIGEN_MINOR_VERSION}) + else() + # Some checks in parsed headers might not be protected with HAVE_EIGEN check + ocv_add_definition(EIGEN_WORLD_VERSION 0) + ocv_add_definition(EIGEN_MAJOR_VERSION 0) + ocv_add_definition(EIGEN_MINOR_VERSION 0) + endif() + if(HAVE_LAPACK) + ocv_add_definition(HAVE_LAPACK 1) + endif() + + if(OPENCV_DISABLE_FILESYSTEM_SUPPORT) + ocv_add_definition(OPENCV_HAVE_FILESYSTEM_SUPPORT 0) + else() + ocv_add_definition(OPENCV_HAVE_FILESYSTEM_SUPPORT 1) + endif() + + ocv_add_definition(OPENCV_BINDINGS_PARSER 1) + + # Implementation details definitions, having no impact on how bindings are + # generated, so their real values can be safely ignored + ocv_add_definition(CV_ENABLE_UNROLLED 0) + ocv_add_definition(CV__EXCEPTION_PTR 0) + ocv_add_definition(CV_NEON 0) + ocv_add_definition(TBB_INTERFACE_VERSION 0) + ocv_add_definition(CV_SSE2 0) + ocv_add_definition(CV_VSX 0) + ocv_add_definition(OPENCV_SUPPORTS_FP_DENORMALS_HINT 0) + ocv_add_definition(CV_LOG_STRIP_LEVEL 0) + ocv_add_definition(CV_LOG_LEVEL_SILENT 0) + ocv_add_definition(CV_LOG_LEVEL_FATAL 1) + ocv_add_definition(CV_LOG_LEVEL_ERROR 2) + ocv_add_definition(CV_LOG_LEVEL_WARN 3) + ocv_add_definition(CV_LOG_LEVEL_INFO 4) + ocv_add_definition(CV_LOG_LEVEL_DEBUG 5) + ocv_add_definition(CV_LOG_LEVEL_VERBOSE 6) + ocv_add_definition(CERES_FOUND 0) + + set(${output_variable} ${defs} PARENT_SCOPE) +endfunction() diff --git a/cmake/OpenCVFindLibsGrfmt.cmake b/cmake/OpenCVFindLibsGrfmt.cmake index 60eeb99e14..6b1c9bdfaf 100644 --- a/cmake/OpenCVFindLibsGrfmt.cmake +++ b/cmake/OpenCVFindLibsGrfmt.cmake @@ -297,6 +297,9 @@ if(WITH_SPNG) else() if(PkgConfig_FOUND) pkg_check_modules(SPNG QUIET spng) + if(SPNG_FOUND) + set(SPNG_LIBRARY ${SPNG_LIBRARIES} CACHE INTERNAL "") + endif() endif() endif() if(SPNG_FOUND) diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake index 760b4c3287..b302c67771 100644 --- 
a/cmake/OpenCVFindLibsPerf.cmake +++ b/cmake/OpenCVFindLibsPerf.cmake @@ -197,11 +197,13 @@ if(WITH_FASTCV) ocv_install_3rdparty_licenses(FastCV "${OpenCV_BINARY_DIR}/3rdparty/fastcv/LICENSE") add_library(fastcv STATIC IMPORTED) set_target_properties(fastcv PROPERTIES - IMPORTED_LINK_INTERFACE_LIBRARIES "" + IMPORTED_LINK_INTERFACE_LIBRARIES "dl" IMPORTED_LOCATION "${FastCV_LIB_PATH}/libfastcv.a" ) if (NOT BUILD_SHARED_LIBS) install(FILES "${FastCV_LIB_PATH}/libfastcv.a" DESTINATION "${OPENCV_3P_LIB_INSTALL_PATH}" COMPONENT "dev") + set(FASTCV_LOCATION_PATH "${FastCV_LIB_PATH}/libfastcv.a" CACHE INTERNAL "" FORCE) + set(FASTCV_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_3P_LIB_INSTALL_PATH}/libfastcv.a" CACHE INTERNAL "" FORCE) endif() set(FASTCV_LIBRARY "fastcv" CACHE PATH "FastCV library") list(APPEND OPENCV_LINKER_LIBS ${FASTCV_LIBRARY}) diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake index df48ae0848..c5b7ab13df 100644 --- a/cmake/OpenCVGenConfig.cmake +++ b/cmake/OpenCVGenConfig.cmake @@ -68,6 +68,14 @@ else() set(USE_IPPIW FALSE) endif() +if(TARGET fastcv AND NOT BUILD_SHARED_LIBS) + file(RELATIVE_PATH FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_BINARY_DIR}" "${FASTCV_LOCATION_PATH}") + ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-FastCV.cmake.in" FASTCV_CONFIGCMAKE @ONLY) + set(USE_FASTCV TRUE) +else() + set(USE_FASTCV FALSE) +endif() + ocv_cmake_hook(PRE_CMAKE_CONFIG_BUILD) configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig.cmake" @ONLY) #support for version checking when finding opencv. find_package(OpenCV 2.3.1 EXACT) should now work. @@ -92,6 +100,11 @@ if(USE_IPPIW) ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-IPPIW.cmake.in" IPPIW_CONFIGCMAKE @ONLY) endif() +if(USE_FASTCV) + file(RELATIVE_PATH FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}" "${FASTCV_INSTALL_PATH}") + ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-FastCV.cmake.in" FASTCV_CONFIGCMAKE @ONLY) +endif() + function(ocv_gen_config TMP_DIR NESTED_PATH ROOT_NAME) ocv_path_join(__install_nested "${OPENCV_CONFIG_INSTALL_PATH}" "${NESTED_PATH}") ocv_path_join(__tmp_nested "${TMP_DIR}" "${NESTED_PATH}") diff --git a/cmake/OpenCVPackaging.cmake b/cmake/OpenCVPackaging.cmake index e90aabb893..e97f40e1ae 100644 --- a/cmake/OpenCVPackaging.cmake +++ b/cmake/OpenCVPackaging.cmake @@ -12,7 +12,7 @@ if(NOT OPENCV_CUSTOM_PACKAGE_INFO) "OpenCV (Open Source Computer Vision Library) is an open source computer vision and machine learning software library. OpenCV was built to provide a common infrastructure for computer vision applications and to accelerate the use of -machine perception in the commercial products. Being a BSD-licensed product, +machine perception in the commercial products. 
Being an Apache 2.0-licensed product, OpenCV makes it easy for businesses to utilize and modify the code.") set(CPACK_PACKAGE_VENDOR "OpenCV Foundation") set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") diff --git a/cmake/templates/OpenCVConfig-FastCV.cmake.in b/cmake/templates/OpenCVConfig-FastCV.cmake.in new file mode 100644 index 0000000000..9ce3d5ef33 --- /dev/null +++ b/cmake/templates/OpenCVConfig-FastCV.cmake.in @@ -0,0 +1,7 @@ +if(NOT TARGET fastcv) + add_library(fastcv STATIC IMPORTED) + set_target_properties(fastcv PROPERTIES + IMPORTED_LINK_INTERFACE_LIBRARIES "" + IMPORTED_LOCATION "${OpenCV_INSTALL_PATH}/@FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE@" + ) +endif() diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in index 2e9022a355..c642863e9a 100644 --- a/cmake/templates/OpenCVConfig.cmake.in +++ b/cmake/templates/OpenCVConfig.cmake.in @@ -99,6 +99,8 @@ endif() @IPPICV_CONFIGCMAKE@ @IPPIW_CONFIGCMAKE@ +@FASTCV_CONFIGCMAKE@ + # Some additional settings are required if OpenCV is built as static libs set(OpenCV_SHARED @BUILD_SHARED_LIBS@) diff --git a/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown b/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown index d8672325d4..a40790540a 100644 --- a/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown +++ b/doc/tutorials/dnn/dnn_openvino/dnn_openvino.markdown @@ -26,3 +26,14 @@ There are 2 approaches how to get OpenCV: - Build OpenCV from source code against specific version of OpenVINO. This approach solves the limitations mentioned above. The instruction how to follow both approaches is provided in [OpenCV wiki](https://github.com/opencv/opencv/wiki/BuildOpenCV4OpenVINO). + +## Supported targets + +The OpenVINO backend (DNN_BACKEND_INFERENCE_ENGINE) supports the following [targets](https://docs.opencv.org/4.x/d6/d0f/group__dnn.html#ga709af7692ba29788182cf573531b0ff5): + +- **DNN_TARGET_CPU:** Runs on the CPU, no additional dependencies required. +- **DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16:** Runs on the iGPU, requires OpenCL drivers. Install [intel-opencl-icd](https://launchpad.net/ubuntu/jammy/+package/intel-opencl-icd) on Ubuntu. +- **DNN_TARGET_MYRIAD:** Runs on Intel® VPUs such as the [Neural Compute Stick](https://www.intel.com/content/www/us/en/products/sku/140109/intel-neural-compute-stick-2/specifications.html); for setup instructions [see](https://www.intel.com/content/www/us/en/developer/archive/tools/neural-compute-stick.html). +- **DNN_TARGET_HDDL:** Runs on the Intel® Movidius™ Myriad™ X High Density Deep Learning VPU; for details [see](https://intelsmartedge.github.io/ido-specs/doc/building-blocks/enhanced-platform-awareness/smartedge-open_hddl/). +- **DNN_TARGET_FPGA:** Runs on Intel® Altera® series FPGAs; for details [see](https://www.intel.com/content/www/us/en/docs/programmable/768970/2025-1/getting-started-guide.html). +- **DNN_TARGET_NPU:** Runs on the integrated Intel® AI Boost processor, requires [Linux drivers](https://github.com/intel/linux-npu-driver/releases/tag/v1.17.0) or [Windows drivers](https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html). \ No newline at end of file diff --git a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown index 50f98e0e71..4b5aaa7575 100644 --- a/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown +++ b/doc/tutorials/dnn/dnn_yolo/dnn_yolo.markdown @@ -41,7 +41,7 @@ Assuming that we have successfully trained YOLOX model, the subsequent step involves
There are several critical considerations to address before proceeding with this process. Let's delve into these aspects. -### YOLO's Pre-proccessing & Output +### YOLO's Pre-processing & Output Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal. These detectors, akin to most Deep Neural Networks (DNN), typically exhibit variation in input diff --git a/doc/tutorials/introduction/building_fastcv/building_fastcv.markdown b/doc/tutorials/introduction/building_fastcv/building_fastcv.markdown index 7b48714f25..a8322191c0 100644 --- a/doc/tutorials/introduction/building_fastcv/building_fastcv.markdown +++ b/doc/tutorials/introduction/building_fastcv/building_fastcv.markdown @@ -144,9 +144,9 @@ HAL and Extension list of APIs | |pyrUp & pyrDown |fcvPyramidCreateu8_v4 | | |cvtColor |fcvColorRGB888toYCrCbu8_v3 | | | |fcvColorRGB888ToHSV888u8 | -| |GaussianBlur |fcvFilterGaussian5x5u8_v3 | +| |gaussianBlur |fcvFilterGaussian5x5u8_v3 | | | |fcvFilterGaussian3x3u8_v4 | -| |cvWarpPerspective |fcvWarpPerspectiveu8_v5 | +| |warpPerspective |fcvWarpPerspectiveu8_v5 | | |Canny |fcvFilterCannyu8 | | | | | |CORE |lut | fcvTableLookupu8 | @@ -166,6 +166,7 @@ HAL and Extension list of APIs | | |fcvElementMultiplyf32 | | |addWeighted |fcvAddWeightedu8_v2 | | |subtract |fcvImageDiffu8f32_v2 | +| |SVD & solve |fcvSVDf32_v2 | **FastCV based OpenCV Extensions APIs list :** @@ -221,10 +222,10 @@ HAL and Extension list of APIs | |fcvFilterCorrSep17x17s16_v2 | | |fcvFilterCorrSepNxNs16 | |sobel3x3u8 |fcvImageGradientSobelPlanars8_v2 | -|sobel3x3u9 |fcvImageGradientSobelPlanars16_v2 | -|sobel3x3u10 |fcvImageGradientSobelPlanars16_v3 | -|sobel3x3u11 |fcvImageGradientSobelPlanarf32_v2 | -|sobel3x3u12 |fcvImageGradientSobelPlanarf32_v3 | +|sobel3x3u8 |fcvImageGradientSobelPlanars16_v2 | +|sobel3x3u8 |fcvImageGradientSobelPlanars16_v3 | +|sobel3x3u8 |fcvImageGradientSobelPlanarf32_v2 | +|sobel3x3u8 |fcvImageGradientSobelPlanarf32_v3 | |sobel |fcvFilterSobel3x3u8_v2 | | |fcvFilterSobel3x3u8s16 | | |fcvFilterSobel5x5u8s16 | @@ -244,3 +245,4 @@ HAL and Extension list of APIs |trackOpticalFlowLK |fcvTrackLKOpticalFlowu8_v3 | | |fcvTrackLKOpticalFlowu8 | |warpPerspective2Plane |fcv2PlaneWarpPerspectiveu8 | +|warpPerspective |fcvWarpPerspectiveu8_v5 | diff --git a/hal/carotene/include/carotene/functions.hpp b/hal/carotene/include/carotene/functions.hpp index 06f1adf3b3..15a12e765b 100644 --- a/hal/carotene/include/carotene/functions.hpp +++ b/hal/carotene/include/carotene/functions.hpp @@ -1040,7 +1040,7 @@ namespace CAROTENE_NS { s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); /* - Among each pixel `p` within `src` find min and max values and its first occurences + Among each pixel `p` within `src` find min and max values and its first occurrences */ void minMaxLoc(const Size2D &size, const s8 * srcBase, ptrdiff_t srcStride, diff --git a/hal/ipp/CMakeLists.txt b/hal/ipp/CMakeLists.txt index c80e76bfed..bf57db6f8e 100644 --- a/hal/ipp/CMakeLists.txt +++ b/hal/ipp/CMakeLists.txt @@ -13,6 +13,7 @@ add_library(ipphal STATIC "${CMAKE_CURRENT_SOURCE_DIR}/src/norm_ipp.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/cart_polar_ipp.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/transforms_ipp.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/sum_ipp.cpp" ) #TODO: HAVE_IPP_ICV and HAVE_IPP_IW added as private macro till OpenCV itself is diff --git a/hal/ipp/include/ipp_hal_core.hpp b/hal/ipp/include/ipp_hal_core.hpp index 6707db7290..caa8c765b2 100644 --- a/hal/ipp/include/ipp_hal_core.hpp +++ 
b/hal/ipp/include/ipp_hal_core.hpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #ifndef __IPP_HAL_CORE_HPP__ #define __IPP_HAL_CORE_HPP__ @@ -32,6 +36,11 @@ int ipp_hal_normDiff(const uchar* src1, size_t src1_step, const uchar* src2, siz #undef cv_hal_normDiff #define cv_hal_normDiff ipp_hal_normDiff +int ipp_hal_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result); + +#undef cv_hal_sum +#define cv_hal_sum ipp_hal_sum + #endif int ipp_hal_polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees); @@ -56,4 +65,6 @@ int ipp_hal_transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, #undef cv_hal_transpose2d #define cv_hal_transpose2d ipp_hal_transpose2d +//! @endcond + #endif diff --git a/hal/ipp/include/ipp_utils.hpp b/hal/ipp/include/ipp_utils.hpp index 26ae75affd..6000bdc817 100644 --- a/hal/ipp/include/ipp_utils.hpp +++ b/hal/ipp/include/ipp_utils.hpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #ifndef __IPP_HAL_UTILS_HPP__ #define __IPP_HAL_UTILS_HPP__ diff --git a/hal/ipp/src/cart_polar_ipp.cpp b/hal/ipp/src/cart_polar_ipp.cpp index 39f4d4a53a..676cf5deeb 100644 --- a/hal/ipp/src/cart_polar_ipp.cpp +++ b/hal/ipp/src/cart_polar_ipp.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #include "ipp_hal_core.hpp" #include diff --git a/hal/ipp/src/mean_ipp.cpp b/hal/ipp/src/mean_ipp.cpp index 38412271b5..75500572cd 100644 --- a/hal/ipp/src/mean_ipp.cpp +++ b/hal/ipp/src/mean_ipp.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #include "ipp_hal_core.hpp" #include diff --git a/hal/ipp/src/minmax_ipp.cpp b/hal/ipp/src/minmax_ipp.cpp index a8d7b7cad8..ae0bdc1747 100644 --- a/hal/ipp/src/minmax_ipp.cpp +++ b/hal/ipp/src/minmax_ipp.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #include "ipp_hal_core.hpp" #include diff --git a/hal/ipp/src/norm_ipp.cpp b/hal/ipp/src/norm_ipp.cpp index 16b0d9bd91..95c428ac8a 100644 --- a/hal/ipp/src/norm_ipp.cpp +++ b/hal/ipp/src/norm_ipp.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #include "ipp_hal_core.hpp" #include diff --git a/hal/ipp/src/sum_ipp.cpp b/hal/ipp/src/sum_ipp.cpp new file mode 100644 index 0000000000..148d1a0298 --- /dev/null +++ b/hal/ipp/src/sum_ipp.cpp @@ -0,0 +1,59 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#include "ipp_hal_core.hpp" + +#include +#include + +#if IPP_VERSION_X100 >= 700 + +int ipp_hal_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result) +{ + int cn = CV_MAT_CN(src_type); + if (cn > 4) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + IppiSize sz = { width, height }; + + typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm); + typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *); + ippiSumFuncHint ippiSumHint = + src_type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R : + src_type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R : + src_type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R : + 0; + ippiSumFuncNoHint ippiSum = + src_type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R : + src_type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R : + src_type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R : + src_type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R : + src_type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R : + src_type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R : + src_type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R : + src_type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R : + src_type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R : + 0; + + if( ippiSumHint || ippiSum ) + { + IppStatus ret = ippiSumHint ? + CV_INSTRUMENT_FUN_IPP(ippiSumHint, src_data, (int)src_step, sz, result, ippAlgHintAccurate) : + CV_INSTRUMENT_FUN_IPP(ippiSum, src_data, (int)src_step, sz, result); + if( ret >= 0 ) + { + return CV_HAL_ERROR_OK; + } + else + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif diff --git a/hal/ipp/src/transforms_ipp.cpp b/hal/ipp/src/transforms_ipp.cpp index ae38310051..83e66d52a5 100644 --- a/hal/ipp/src/transforms_ipp.cpp +++ b/hal/ipp/src/transforms_ipp.cpp @@ -1,3 +1,7 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + #include "ipp_hal_core.hpp" #include diff --git a/hal/kleidicv/CMakeLists.txt b/hal/kleidicv/CMakeLists.txt index 9273deb3c9..5d78c83d2c 100644 --- a/hal/kleidicv/CMakeLists.txt +++ b/hal/kleidicv/CMakeLists.txt @@ -2,6 +2,7 @@ project(kleidicv_hal) if(HAVE_KLEIDICV) option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK + option(KLEIDICV_USE_CV_NAMESPACE_IN_OPENCV_HAL "" OFF) include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt") # HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function] target_compile_options( kleidicv_hal PRIVATE diff --git a/hal/ndsrvp/src/bilateralFilter.cpp b/hal/ndsrvp/src/bilateralFilter.cpp index c7a51b4199..fa92584000 100644 --- a/hal/ndsrvp/src/bilateralFilter.cpp +++ b/hal/ndsrvp/src/bilateralFilter.cpp @@ -156,10 +156,12 @@ int bilateralFilter(const uchar* src_data, size_t src_step, int i, j, maxk, radius; - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; + constexpr double eps = 1e-6; + if( sigma_color <= eps || sigma_space <= eps ) + { + src.copyTo(dst); + return CV_HAL_ERROR_OK; + } double gauss_color_coeff = -0.5/(sigma_color * sigma_color); double gauss_space_coeff = -0.5/(sigma_space * sigma_space); diff --git a/hal/riscv-rvv/CMakeLists.txt b/hal/riscv-rvv/CMakeLists.txt index 8c19800053..a0c9e628b3 100644 --- a/hal/riscv-rvv/CMakeLists.txt +++ b/hal/riscv-rvv/CMakeLists.txt @@ -1,9 +1,26 @@ cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR) -set(HAL_LIB_NAME "") +set(RVV_HAL_INCLUDE_DIR include) +set(RVV_HAL_SOURCE_DIR src) + +file(GLOB rvv_hal_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_INCLUDE_DIR}/*.hpp") +file(GLOB rvv_hal_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_SOURCE_DIR}/**/*.cpp") + +set(HAL_LIB_NAME "rvv_hal") +add_library(${HAL_LIB_NAME} STATIC) +target_sources(${HAL_LIB_NAME} PRIVATE ${rvv_hal_headers} ${rvv_hal_sources}) + +set_target_properties(${HAL_LIB_NAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}) +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(${HAL_LIB_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(${HAL_LIB_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/modules/core/include + ${CMAKE_SOURCE_DIR}/modules/imgproc/include) # ${CMAKE_SOURCE_DIR}/modules/features2d/include set(RVV_HAL_FOUND TRUE CACHE INTERNAL "") set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "") set(RVV_HAL_LIBRARIES ${HAL_LIB_NAME} CACHE INTERNAL "") -set(RVV_HAL_HEADERS "hal_rvv.hpp" CACHE INTERNAL "") -set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/modules/imgproc/include" CACHE INTERNAL "") +set(RVV_HAL_HEADERS "rvv_hal.hpp" CACHE INTERNAL "") +set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "") diff --git a/hal/riscv-rvv/hal_rvv.hpp b/hal/riscv-rvv/hal_rvv.hpp deleted file mode 100644 index 8fe78bd8b9..0000000000 --- a/hal/riscv-rvv/hal_rvv.hpp +++ /dev/null @@ -1,65 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. 
- -#ifndef OPENCV_HAL_RVV_HPP_INCLUDED -#define OPENCV_HAL_RVV_HPP_INCLUDED - -#include "opencv2/core/base.hpp" -#include "opencv2/core/hal/interface.h" -#include "opencv2/imgproc/hal/interface.h" - -#ifndef CV_HAL_RVV_071_ENABLED -# if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) && defined(__riscv_v) && __riscv_v == 7000 -# define CV_HAL_RVV_071_ENABLED 1 -# else -# define CV_HAL_RVV_071_ENABLED 0 -# endif -#endif - -#if CV_HAL_RVV_071_ENABLED -#include "version/hal_rvv_071.hpp" -#endif - -#if defined(__riscv_v) && __riscv_v == 1000000 -#include "hal_rvv_1p0/types.hpp" -#include "hal_rvv_1p0/merge.hpp" // core -#include "hal_rvv_1p0/mean.hpp" // core -#include "hal_rvv_1p0/dxt.hpp" // core -#include "hal_rvv_1p0/norm.hpp" // core -#include "hal_rvv_1p0/norm_diff.hpp" // core -#include "hal_rvv_1p0/norm_hamming.hpp" // core -#include "hal_rvv_1p0/convert_scale.hpp" // core -#include "hal_rvv_1p0/minmax.hpp" // core -#include "hal_rvv_1p0/atan.hpp" // core -#include "hal_rvv_1p0/split.hpp" // core -#include "hal_rvv_1p0/magnitude.hpp" // core -#include "hal_rvv_1p0/cart_to_polar.hpp" // core -#include "hal_rvv_1p0/polar_to_cart.hpp" // core -#include "hal_rvv_1p0/flip.hpp" // core -#include "hal_rvv_1p0/lut.hpp" // core -#include "hal_rvv_1p0/exp.hpp" // core -#include "hal_rvv_1p0/log.hpp" // core -#include "hal_rvv_1p0/lu.hpp" // core -#include "hal_rvv_1p0/cholesky.hpp" // core -#include "hal_rvv_1p0/qr.hpp" // core -#include "hal_rvv_1p0/svd.hpp" // core -#include "hal_rvv_1p0/sqrt.hpp" // core -#include "hal_rvv_1p0/copy_mask.hpp" // core -#include "hal_rvv_1p0/div.hpp" // core -#include "hal_rvv_1p0/dotprod.hpp" // core -#include "hal_rvv_1p0/compare.hpp" // core -#include "hal_rvv_1p0/transpose.hpp" // core - -#include "hal_rvv_1p0/moments.hpp" // imgproc -#include "hal_rvv_1p0/filter.hpp" // imgproc -#include "hal_rvv_1p0/pyramids.hpp" // imgproc -#include "hal_rvv_1p0/color.hpp" // imgproc -#include "hal_rvv_1p0/warp.hpp" // imgproc -#include "hal_rvv_1p0/thresh.hpp" // imgproc -#include "hal_rvv_1p0/histogram.hpp" // imgproc -#include "hal_rvv_1p0/resize.hpp" // imgproc -#include "hal_rvv_1p0/integral.hpp" // imgproc -#endif - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp b/hal/riscv-rvv/hal_rvv_1p0/atan.hpp deleted file mode 100644 index b864fea2c1..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/atan.hpp +++ /dev/null @@ -1,128 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
- -#ifndef OPENCV_HAL_RVV_ATAN_HPP_INCLUDED -#define OPENCV_HAL_RVV_ATAN_HPP_INCLUDED - -#undef cv_hal_fastAtan32f -#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32 - -#undef cv_hal_fastAtan64f -#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64 - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -namespace detail { -// ref: mathfuncs_core.simd.hpp -static constexpr float pi = CV_PI; - -struct AtanParams -{ - float p1, p3, p5, p7, angle_90; -}; - -static constexpr AtanParams atan_params_rad { - 0.9997878412794807F, - -0.3258083974640975F, - 0.1555786518463281F, - -0.04432655554792128F, - 90.F * (pi / 180.F)}; -static constexpr AtanParams atan_params_deg { - atan_params_rad.p1 * (180 / pi), - atan_params_rad.p3 * (180 / pi), - atan_params_rad.p5 * (180 / pi), - atan_params_rad.p7 * (180 / pi), - 90.F}; - -template -__attribute__((always_inline)) inline VEC_T - rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params) -{ - const auto ax = __riscv_vfabs(vx, vl); - const auto ay = __riscv_vfabs(vy, vl); - // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. - const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), - __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), - vl); - const auto c2 = __riscv_vfmul(c, c, vl); - - // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional - // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased - // from 5.952ms to 5.805ms on Muse Pi) - // Additionally, when registers are nearly fully utilized (though not yet exhausted), the - // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in - // cv::cv_hal_rvv::fast_atan_64). - // Saving registers can also make this function more reusable in other contexts. - // Therefore, vfmadd is not used here. - auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); - a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); - a = __riscv_vfmul(a, c, vl); - - a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); - a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); - - return a; -} - -} // namespace detail - -inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) -{ - auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e32m4(n); - - auto vy = __riscv_vle32_v_f32m4(y, vl); - auto vx = __riscv_vle32_v_f32m4(x, vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse32(dst, a, vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) -{ - // this also uses float32 version, ref: mathfuncs_core.simd.hpp - - auto atan_params = angle_in_deg ? 
detail::atan_params_deg : detail::atan_params_rad; - - for (size_t vl = 0; n > 0; n -= vl) - { - vl = __riscv_vsetvl_e64m8(n); - - auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); - auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); - - auto a = detail::rvv_atan(vy, vx, vl, atan_params); - - __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); - - x += vl; - y += vl; - dst += vl; - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif //OPENCV_HAL_RVV_ATAN_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/common.hpp b/hal/riscv-rvv/hal_rvv_1p0/common.hpp deleted file mode 100644 index 9fc01d2897..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/common.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. - -#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace custom_intrin { - -#define CV_HAL_RVV_NOOP(a) (a) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ - inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ - _Tpvs mask = __riscv_vsra(v, shift, vl); \ - _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ - return __riscv_vreinterpret_##suffix( \ - __riscv_vsub(v_xor, mask, vl) \ - ); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) - -#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ - inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ - return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ - } - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) - -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) -CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) - -}}} // cv::cv_hal_rvv::custom_intrin - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/div.hpp b/hal/riscv-rvv/hal_rvv_1p0/div.hpp deleted file mode 100644 index ccbeb6403d..0000000000 --- 
a/hal/riscv-rvv/hal_rvv_1p0/div.hpp +++ /dev/null @@ -1,268 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. -// -// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -// Third party copyrights are property of their respective owners. - -#ifndef OPENCV_HAL_RVV_DIV_HPP_INCLUDED -#define OPENCV_HAL_RVV_DIV_HPP_INCLUDED - -#include -#include - -namespace cv { namespace cv_hal_rvv { namespace div { - -namespace { - - inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); } - - inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); } - inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); } - inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); } - inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); } - inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); } - inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); } - - inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); } - inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int16_t *p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); } - inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); } - - inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } - inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } - - inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } - inline vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } - - template inline - VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { - return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); - } - template <> inline - vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) { - auto f1 = __riscv_vfcvt_f(v1, vl); - auto f2 = __riscv_vfcvt_f(v2, vl); - auto res = __riscv_vfmul(f1, __riscv_vfrdiv(f2, scale, vl), vl); - return __riscv_vfcvt_xu(res, vl); - } - - template inline - VT recip_sat(const VT &v, const float scale, const int vl) { - return nclip(recip_sat(ext(v, vl), scale, vl), vl); - } - template <> inline - vint32m8_t recip_sat(const vint32m8_t &v, const float 
scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_x(res, vl); - } - template <> inline - vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) { - auto f = __riscv_vfcvt_f(v, vl); - auto res = __riscv_vfrdiv(f, scale, vl); - return __riscv_vfcvt_xu(res, vl); - } - -} // anonymous - -#undef cv_hal_div8u -#define cv_hal_div8u cv::cv_hal_rvv::div::div -#undef cv_hal_div8s -#define cv_hal_div8s cv::cv_hal_rvv::div::div -#undef cv_hal_div16u -#define cv_hal_div16u cv::cv_hal_rvv::div::div -#undef cv_hal_div16s -#define cv_hal_div16s cv::cv_hal_rvv::div::div -#undef cv_hal_div32s -#define cv_hal_div32s cv::cv_hal_rvv::div::div -#undef cv_hal_div32f -#define cv_hal_div32f cv::cv_hal_rvv::div::div -// #undef cv_hal_div64f -// #define cv_hal_div64f cv::cv_hal_rvv::div::div - -template inline -int div(const ST *src1, size_t step1, const ST *src2, size_t step2, - ST *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f || - (scale * static_cast(std::numeric_limits::max())) < 1.f && - (scale * static_cast(std::numeric_limits::max())) > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - ST *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - auto mask = __riscv_vmseq(v2, 0, vl); - vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int div(const float *src1, size_t step1, const float *src2, size_t step2, - float *dst, size_t step, int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); - const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); - float *dst_h = reinterpret_cast((uchar*)dst + h * step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v1 = vle(src1_h + w, vl); - auto v2 = vle(src2_h + w, vl); - - vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -#undef cv_hal_recip8u -#define cv_hal_recip8u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip8s -#define cv_hal_recip8s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16u -#define cv_hal_recip16u cv::cv_hal_rvv::div::recip -#undef cv_hal_recip16s -#define 
cv_hal_recip16s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32s -#define cv_hal_recip32s cv::cv_hal_rvv::div::recip -#undef cv_hal_recip32f -#define cv_hal_recip32f cv::cv_hal_rvv::div::recip -// #undef cv_hal_recip64f -// #define cv_hal_recip64f cv::cv_hal_rvv::div::recip - -template inline -int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f || scale < 1.f && scale > -1.f) { - for (int h = 0; h < height; h++) { - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(ST) * width); - } - return CV_HAL_ERROR_OK; - } - - for (int h = 0; h < height; h++) { - const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - auto mask = __riscv_vmseq(v, 0, vl); - vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -template <> inline -int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, - int width, int height, float scale) { - if (scale == 0.f) { - for (int h = 0; h < height; h++) { - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - std::memset(dst_h, 0, sizeof(float) * width); - } - return CV_HAL_ERROR_OK; - } - - if (std::fabs(scale - 1.f) < FLT_EPSILON) { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl); - } - } - } else { - for (int h = 0; h < height; h++) { - const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); - float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); - - int vl; - for (int w = 0; w < width; w += vl) { - vl = setvl(width - w); - - auto v = vle(src_h + w, vl); - - vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -}}} // cv::cv_hal_rvv::div - -#endif // OPENCV_HAL_RVV_DIV_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp b/hal/riscv-rvv/hal_rvv_1p0/filter.hpp deleted file mode 100644 index 85949137e3..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/filter.hpp +++ /dev/null @@ -1,2553 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_FILTER_HPP_INCLUDED -#define OPENCV_HAL_RVV_FILTER_HPP_INCLUDED - -#include - -struct cvhalFilter2D; - -namespace cv { namespace cv_hal_rvv { - -namespace filter { -#undef cv_hal_filterInit -#undef cv_hal_filter -#undef cv_hal_filterFree -#define cv_hal_filterInit cv::cv_hal_rvv::filter::filterInit -#define cv_hal_filter cv::cv_hal_rvv::filter::filter -#define cv_hal_filterFree cv::cv_hal_rvv::filter::filterFree - -class FilterInvoker : public ParallelLoopBody -{ -public: - template - FilterInvoker(std::function _func, Args&&... 
args) - { - func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); - } - - virtual void operator()(const Range& range) const override - { - func(range.start, range.end); - } - -private: - std::function func; -}; - -template -static inline int invoke(int height, std::function func, Args&&... args) -{ - cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); - return func(0, 1, std::forward(args)...); -} - -static inline int borderInterpolate( int p, int len, int borderType ) -{ - if ((unsigned)p < (unsigned)len) - ; - else if (borderType == BORDER_REPLICATE) - p = p < 0 ? 0 : len - 1; - else if (borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101) - { - int delta = borderType == BORDER_REFLECT_101; - if (len == 1) - return 0; - do - { - if (p < 0) - p = -p - 1 + delta; - else - p = len - 1 - (p - len) - delta; - } - while( (unsigned)p >= (unsigned)len ); - } - else if (borderType == BORDER_WRAP) - { - if (p < 0) - p -= ((p-len+1)/len)*len; - if (p >= len) - p %= len; - } - else if (borderType == BORDER_CONSTANT) - p = -1; - return p; -} - -struct Filter2D -{ - const uchar* kernel_data; - size_t kernel_step; - int kernel_type; - int kernel_width; - int kernel_height; - int src_type; - int dst_type; - int borderType; - double delta; - int anchor_x; - int anchor_y; -}; - -inline int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != 3 && kernel_width != 5) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); - return CV_HAL_ERROR_OK; -} - -static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - return __riscv_vfmacc(a, k2, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { - if (!row) return; - - const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, extra[0], extra[4]); - s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); - s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); - s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m1x4_t val{}; - val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) -{ - int vl; - for (int i = left; i < right; i += vl) - { - vl = __riscv_vsetvl_e8m1(right - i); - auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); - auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); - - auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { - a = __riscv_vfmacc(a, k0, b, vl); - b = __riscv_vfslide1down(b, r1, vl); - a = __riscv_vfmacc(a, k1, b, vl); - b = __riscv_vfslide1down(b, r2, vl); - a = __riscv_vfmacc(a, k2, b, vl); - b = __riscv_vfslide1down(b, r3, vl); - a = __riscv_vfmacc(a, k3, b, vl); - b = __riscv_vfslide1down(b, r4, vl); - return __riscv_vfmacc(a, k4, b, vl); - }; - auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { - if (!row) return; - 
- const uchar* extra = row + (i - anchor) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); - auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); - auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); - auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); - - extra += vl * 4; - s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); - s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); - s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); - s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); - }; - - loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); - loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); - loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); - loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); - loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); - vuint8m1x4_t val{}; - val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); - val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst + i * 4, val, vl); - } -} - -// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, -// in the function void CAROTENE_NS::convolution -template -static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - float kernel[ksize * ksize]; - for (int i = 0; i < ksize * ksize; i++) - { - kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if (data->borderType & BORDER_ISOLATED) - { - pi = borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? noval : pj; - } - else - { - pi = borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? 
noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - float sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = data->delta; - for (int i = 0; i < ksize * ksize; i++) - { - auto p = access(x + i / ksize, y + i % ksize); - if (p.first != noval && p.second != noval) - { - sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; - sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; - sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; - sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; - } - } - dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); - dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; - if (ksize == 3) - { - process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); - } - else - { - const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; - const uchar* row4 = access(i + 4, 0).first == noval ? 
nullptr : src_data + access(i + 4, 0).first * src_step; - process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - Filter2D* data = reinterpret_cast(context); - std::vector dst(width * height * 4); - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernel_width) - { - case 3: - res = invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - case 5: - res = invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); - break; - } - - for (int i = 0; i < height; i++) - memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); - return res; -} - -inline int filterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::filter - -namespace sepFilter { -#undef cv_hal_sepFilterInit -#undef cv_hal_sepFilter -#undef cv_hal_sepFilterFree -#define cv_hal_sepFilterInit cv::cv_hal_rvv::sepFilter::sepFilterInit -#define cv_hal_sepFilter cv::cv_hal_rvv::sepFilter::sepFilter -#define cv_hal_sepFilterFree cv::cv_hal_rvv::sepFilter::sepFilterFree - -struct sepFilter2D -{ - int src_type; - int dst_type; - int kernel_type; - const uchar* kernelx_data; - int kernelx_length; - const uchar* kernely_data; - int kernely_length; - int anchor_x; - int anchor_y; - double delta; - int borderType; -}; - -inline int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) -{ - if (kernel_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (dst_type != CV_16SC1 && dst_type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; - anchor_y = anchor_y < 0 ? kernely_length / 2 : anchor_y; - *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, -// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic -template -static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? 
noval : pi; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - } - return pi; - }; - auto accessY = [&](int y) { - int pj; - if (data->borderType & BORDER_ISOLATED) - { - pj = filter::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); - pj = pj < 0 ? noval : pj; - } - else - { - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); - pj = pj < 0 ? noval : pj - offset_x; - } - return pj; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - const float* kx = reinterpret_cast(data->kernelx_data); - const float* ky = reinterpret_cast(data->kernely_data); - std::vector res(width * ksize); - auto process = [&](int x, int y) { - float sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); - for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m2(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; - vfloat32m8_t src; - if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); - } - else if (std::is_same::value) - { - src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); - } - else - { - src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); - } - - extra += vl; - auto sum = __riscv_vfmul(src, kx[0], vl); - src = __riscv_vfslide1down(src, extra[0], vl); - sum = __riscv_vfmacc(sum, kx[1], src, vl); - src = __riscv_vfslide1down(src, extra[1], vl); - sum = __riscv_vfmacc(sum, kx[2], src, vl); - if (ksize == 5) - { - src = __riscv_vfslide1down(src, extra[2], vl); - sum = __riscv_vfmacc(sum, kx[3], src, vl); - src = __riscv_vfslide1down(src, extra[3], vl); - sum = __riscv_vfmacc(sum, kx[4], src, vl); - } - __riscv_vse32(res.data() + p2idx(i, j), sum, vl); - } - } - } - - int cur = i - (ksize - 1 - data->anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v2 = row2 ? 
__riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); - - if (ksize == 5) - { - auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - auto v4 = row4 ? __riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); - sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); - } - - if (data->dst_type == CV_16SC1) - { - __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); - } - else - { - __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - sepFilter2D* data = reinterpret_cast(context); - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(data->dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - switch (data->kernelx_length*100 + data->src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - case 500 + CV_32FC1: - res = filter::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - break; - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == _dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} - -inline int sepFilterFree(cvhalFilter2D* context) -{ - delete reinterpret_cast(context); - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::sepFilter - -namespace morph { -#undef cv_hal_morphInit -#undef cv_hal_morph -#undef cv_hal_morphFree -#define cv_hal_morphInit cv::cv_hal_rvv::morph::morphInit -#define cv_hal_morph cv::cv_hal_rvv::morph::morph -#define cv_hal_morphFree cv::cv_hal_rvv::morph::morphFree - -struct Morph2D -{ - int operation; - int src_type; - int dst_type; - int kernel_type; - uchar* kernel_data; - size_t kernel_step; - int kernel_width; - int kernel_height; - int anchor_x; - int anchor_y; - int borderType; - const uchar* borderValue; -}; - -inline int morphInit(cvhalFilter2D** context, int operation, int src_type, int 
dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) -{ - if (kernel_type != CV_8UC1 || src_type != dst_type) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_type != CV_8UC1 && src_type != CV_8UC4) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (kernel_width != kernel_height || kernel_width != 3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (iterations != 1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* borderV; - if (src_type == CV_8UC1) - { - borderV = new uchar{static_cast(borderValue[0])}; - if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) - borderV[0] = 0; - } - else - { - borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; - if (operation == CV_HAL_MORPH_DILATE) - { - if (borderValue[0] == DBL_MAX) - borderV[0] = 0; - if (borderValue[1] == DBL_MAX) - borderV[1] = 0; - if (borderValue[2] == DBL_MAX) - borderV[2] = 0; - if (borderValue[3] == DBL_MAX) - borderV[3] = 0; - } - } - - anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y; - *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); - return CV_HAL_ERROR_OK; -} - -template struct rvv; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::max(); } - static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } -}; -template<> struct rvv -{ - static inline uchar init() { return std::numeric_limits::min(); } - static inline uchar mop(uchar a, uchar b) { return a > b ? a : b; } - static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } - static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } -}; - -// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, -// in the function template void morph3x3 -template -static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) -{ - bool kernel[9]; - for (int i = 0; i < 9; i++) - { - kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; - } - - constexpr int noval = std::numeric_limits::max(); - auto access = [&](int x, int y) { - int pi, pj; - if (data->borderType & BORDER_ISOLATED) - { - pi = filter::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); - pj = filter::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); - pi = pi < 0 ? noval : pi; - pj = pj < 0 ? 
noval : pj; - } - else - { - pi = filter::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); - pj = filter::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); - pi = pi < 0 ? noval : pi - offset_y; - pj = pj < 0 ? noval : pj - offset_x; - } - return std::make_pair(pi, pj); - }; - - auto process = [&](int x, int y) { - if (data->src_type == CV_8UC1) - { - uchar val = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val = rvv::mop(val, src_data[p.first * src_step + p.second]); - } - else - { - val = rvv::mop(val, data->borderValue[0]); - } - } - } - dst_data[x * width + y] = val; - } - else - { - uchar val0, val1, val2, val3; - val0 = val1 = val2 = val3 = rvv::init(); - for (int i = 0; i < 9; i++) - { - if (kernel[i]) - { - auto p = access(x + i / 3, y + i % 3); - if (p.first != noval && p.second != noval) - { - val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); - val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); - val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); - val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); - } - else - { - val0 = rvv::mop(val0, data->borderValue[0]); - val1 = rvv::mop(val1, data->borderValue[1]); - val2 = rvv::mop(val2, data->borderValue[2]); - val3 = rvv::mop(val3, data->borderValue[3]); - } - } - } - dst_data[(x * width + y) * 4 ] = val0; - dst_data[(x * width + y) * 4 + 1] = val1; - dst_data[(x * width + y) * 4 + 2] = val2; - dst_data[(x * width + y) * 4 + 3] = val3; - } - }; - - const int left = data->anchor_x, right = width - (2 - data->anchor_x); - for (int i = start; i < end; i++) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; - const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; - const uchar* row2 = access(i + 2, 0).first == noval ? 
nullptr : src_data + access(i + 2, 0).first * src_step; - if (data->src_type == CV_8UC1) - { - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - return; - } - - const uchar* extra = row + j - data->anchor_x; - auto v0 = __riscv_vle8_v_u8m4(extra, vl); - - if (k0) m0 = rvv::vop(m0, v0, vl); - v0 = __riscv_vslide1down(v0, extra[vl], vl); - if (k1) m0 = rvv::vop(m0, v0, vl); - if (!k2) return; - v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); - m0 = rvv::vop(m0, v0, vl); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - __riscv_vse8(dst_data + i * width + j, m0, vl); - } - } - else - { - int vl, vl0, vl1; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m4(right - j); - vl0 = std::min(vl, (int)__riscv_vlenb() * 2); - vl1 = vl - vl0; - auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); - - auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { - if (k0) a = rvv::vop(a, b, vl); - b = __riscv_vslide1down(b, r1, vl); - if (k1) a = rvv::vop(a, b, vl); - if (!k2) return a; - b = __riscv_vslide1down(b, r2, vl); - return rvv::vop(a, b, vl); - }; - auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { - if (!row) - { - m0 = rvv::vop(m0, data->borderValue[0], vl); - m1 = rvv::vop(m1, data->borderValue[1], vl); - m2 = rvv::vop(m2, data->borderValue[2], vl); - m3 = rvv::vop(m3, data->borderValue[3], vl); - return; - } - - vuint8m4_t v0{}, v1{}, v2{}, v3{}; - const uchar* extra = row + (j - data->anchor_x) * 4; - auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); - v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); - v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); - v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); - v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); - - extra += vl * 4; - m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); - m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); - m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); - m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); - }; - - loadsrc(row0, kernel[0], kernel[1], kernel[2]); - loadsrc(row1, kernel[3], kernel[4], kernel[5]); - loadsrc(row2, kernel[6], kernel[7], kernel[8]); - vuint8m2x4_t val{}; - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); - val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); - __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); - val = __riscv_vset_v_u8m2_u8m2x4(val, 0, 
__riscv_vget_v_u8m4_u8m2(m0, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1));
-                    val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1));
-                    __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1);
-                }
-            }
-        }
-    }
-
-    return CV_HAL_ERROR_OK;
-}
-
-inline int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/)
-{
-    Morph2D* data = reinterpret_cast<Morph2D*>(context);
-    int cn = data->src_type == CV_8UC1 ? 1 : 4;
-    std::vector<uchar> dst(width * height * cn);
-
-    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
-    switch (data->operation)
-    {
-    case CV_HAL_MORPH_ERODE:
-        res = filter::invoke(height, {morph<CV_HAL_MORPH_ERODE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
-        break;
-    case CV_HAL_MORPH_DILATE:
-        res = filter::invoke(height, {morph<CV_HAL_MORPH_DILATE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
-        break;
-    }
-
-    for (int i = 0; i < height; i++)
-        memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn);
-    return res;
-}
-
-inline int morphFree(cvhalFilter2D* context)
-{
-    delete reinterpret_cast<Morph2D*>(context)->borderValue;
-    delete reinterpret_cast<Morph2D*>(context);
-    return CV_HAL_ERROR_OK;
-}
-} // cv::cv_hal_rvv::morph
-
-namespace gaussianBlurBinomial {
-#undef cv_hal_gaussianBlurBinomial
-#define cv_hal_gaussianBlurBinomial cv::cv_hal_rvv::gaussianBlurBinomial::gaussianBlurBinomial
-
-// the algorithm is same as cv_hal_sepFilter
-template<int ksize, typename helperT, typename helperWT>
-static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type)
-{
-    using T = typename helperT::ElemType;
-    using WT = typename helperWT::ElemType;
-
-    constexpr int noval = std::numeric_limits<int>::max();
-    auto accessX = [&](int x) {
-        int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type);
-        return pi < 0 ? noval : pi - offset_y;
-    };
-    auto accessY = [&](int y) {
-        int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type);
-        return pj < 0 ?
noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; - auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - if (ksize == 3) - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, src, vl); - } - else - { - src = __riscv_vslide1down(src, extra[0], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[1], vl); - sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); - src = __riscv_vslide1down(src, extra[2], vl); - sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); - src = __riscv_vslide1down(src, extra[3], vl); - sum = __riscv_vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), sum, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); - auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); - typename helperWT::VecType sum; - if (ksize == 3) - { - sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); - } - else - { - sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); - auto v3 = row3 ? helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); - auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); - sum = __riscv_vadd(sum, v4, vl); - } - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl), vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; - - constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; - std::vector res(width * ksize * 4); - auto process = [&](int x, int y) { - ushort sum0, sum1, sum2, sum3; - sum0 = sum1 = sum2 = sum3 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); - sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); - sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); - sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - res[p2idx(x, y) + 3] = sum3; - }; - - const int left = ksize / 2, right = width - ksize / 2; - for (int i = start - ksize / 2; i < end + ksize / 2; i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e8m1(right - j); - const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; - auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); - auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); - auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); - auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); - auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); - - extra += vl * 4; - auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; - if (ksize == 3) - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - else - { - src0 = __riscv_vslide1down(src0, extra[0], vl); - src1 = __riscv_vslide1down(src1, extra[1], vl); - src2 = __riscv_vslide1down(src2, extra[2], vl); - src3 = __riscv_vslide1down(src3, extra[3], vl); - sum0 = 
__riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[4], vl); - src1 = __riscv_vslide1down(src1, extra[5], vl); - src2 = __riscv_vslide1down(src2, extra[6], vl); - src3 = __riscv_vslide1down(src3, extra[7], vl); - sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); - src0 = __riscv_vslide1down(src0, extra[ 8], vl); - src1 = __riscv_vslide1down(src1, extra[ 9], vl); - src2 = __riscv_vslide1down(src2, extra[10], vl); - src3 = __riscv_vslide1down(src3, extra[11], vl); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - src0 = __riscv_vslide1down(src0, extra[12], vl); - src1 = __riscv_vslide1down(src1, extra[13], vl); - src2 = __riscv_vslide1down(src2, extra[14], vl); - src3 = __riscv_vslide1down(src3, extra[15], vl); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint16m2x4_t dst{}; - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); - dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); - __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - ksize / 2; - if (cur >= start) - { - const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const ushort* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e16m2(width - j); - vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; - sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); - - auto loadres = [&](const ushort* row) { - auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); - src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); - src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); - src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); - src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); - }; - if (row0) - { - loadres(row0); - sum0 = src0; - sum1 = src1; - sum2 = src2; - sum3 = src3; - } - if (row1) - { - loadres(row1); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 
2 : 1, vl), vl); - } - if (row2) - { - loadres(row2); - if (ksize == 5) - { - src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); - src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); - src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); - src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); - } - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - if (row3) - { - loadres(row3); - sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); - sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); - sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); - sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); - } - if (row4) - { - loadres(row4); - sum0 = __riscv_vadd(sum0, src0, vl); - sum1 = __riscv_vadd(sum1, src1, vl); - sum2 = __riscv_vadd(sum2, src2, vl); - sum3 = __riscv_vadd(sum3, src3, vl); - } - - vuint8m1x4_t dst{}; - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); - __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_8UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_16UC1: - return filter::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 300 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - case 500 + CV_8UC4: - return filter::invoke(height, {gaussianBlurC4<5>}, src_data, 
src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::gaussianBlurBinomial - -namespace medianBlur { -#undef cv_hal_medianBlur -#define cv_hal_medianBlur cv::cv_hal_rvv::medianBlur::medianBlur - -// the algorithm is copied from imgproc/src/median_blur.simd.cpp -// in the function template static void medianBlur_SortNet -template -static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - using T = typename helper::ElemType; - using VT = typename helper::VecType; - - for (int i = start; i < end; i++) - { - const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); - const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); - const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); - const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); - const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); - int vl; - auto vop = [&vl](VT& a, VT& b) { - auto t = a; - a = helper::vmin(a, b, vl); - b = helper::vmax(t, b, vl); - }; - - for (int j = 0; j < width; j += vl) - { - vl = helper::setvl(width - j); - if (ksize == 3) - { - VT p0, p1, p2; - VT p3, p4, p5; - VT p6, p7, p8; - if (j != 0) - { - p0 = helper::vload(row0 + j - 1, vl); - p3 = helper::vload(row1 + j - 1, vl); - p6 = helper::vload(row2 + j - 1, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p6 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - } - p1 = helper::vslide1down(p0, row0[j + vl - 1], vl); - p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); - p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); - p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); - - vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); - vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); - vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); - vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); - vop(p4, p2); vop(p6, p4); vop(p4, p2); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); - } - else - { - VT p0, p1, p2, p3, p4; - VT p5, p6, p7, p8, p9; - VT p10, p11, p12, p13, p14; - VT p15, p16, p17, p18, p19; - VT p20, p21, p22, p23, p24; - if (j >= 2) - { - p0 = helper::vload(row0 + j - 2, vl); - p5 = helper::vload(row1 + j - 2, vl); - p10 = helper::vload(row2 + j - 2, vl); - p15 = helper::vload(row3 + j - 2, vl); - p20 = helper::vload(row4 + j - 2, vl); - } - else - { - p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); - p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); - p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); - p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); - p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); - if (j == 0) - { - p0 = helper::vslide1up(p0, row0[0], vl); - p5 = helper::vslide1up(p5, row1[0], vl); - p10 = helper::vslide1up(p10, row2[0], vl); - p15 = helper::vslide1up(p15, 
row3[0], vl); - p20 = helper::vslide1up(p20, row4[0], vl); - } - } - p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); - p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); - p11 = helper::vslide1down(p10, row2[j + vl - 2], vl); - p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); - p21 = helper::vslide1down(p20, row4[j + vl - 2], vl); - p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); - p7 = helper::vslide1down(p6, row1[j + vl - 1], vl); - p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); - p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); - p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); - p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); - p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); - p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); - p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); - p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); - p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); - p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); - p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); - p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); - p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); - - vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); - vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); - vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); - vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); - vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); - vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); - vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); - vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); - vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); - vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); - vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); - vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); - vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); - vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18); - vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16); - vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); - vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); - vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); - vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); - vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); - vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); - vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); - vop(p7, p11); vop(p11, p13); vop(p11, p12); - helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - for (int i = start; i < end; i++) - { - const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; - const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; - const uchar* row2 = 
src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; - const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; - const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; - int vl; - for (int j = 0; j < width; j += vl) - { - if (ksize == 3) - { - vl = __riscv_vsetvl_e8m1(width - j); - vuint8m1_t p00, p01, p02; - vuint8m1_t p03, p04, p05; - vuint8m1_t p06, p07, p08; - vuint8m1_t p10, p11, p12; - vuint8m1_t p13, p14, p15; - vuint8m1_t p16, p17, p18; - vuint8m1_t p20, p21, p22; - vuint8m1_t p23, p24, p25; - vuint8m1_t p26, p27, p28; - vuint8m1_t p30, p31, p32; - vuint8m1_t p33, p34, p35; - vuint8m1_t p36, p37, p38; - auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m1x4(row, vl); - p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); - p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); - p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); - p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); - }; - if (j != 0) - { - loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); - loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); - } - else - { - loadsrc(row0, p00, p10, p20, p30); - loadsrc(row1, p03, p13, p23, p33); - loadsrc(row2, p06, p16, p26, p36); - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = __riscv_vslide1up(p30, row0[3], vl); - p03 = __riscv_vslide1up(p03, row1[0], vl); - p13 = __riscv_vslide1up(p13, row1[1], vl); - p23 = __riscv_vslide1up(p23, row1[2], vl); - p33 = __riscv_vslide1up(p33, row1[3], vl); - p06 = __riscv_vslide1up(p06, row2[0], vl); - p16 = __riscv_vslide1up(p16, row2[1], vl); - p26 = __riscv_vslide1up(p26, row2[2], vl); - p36 = __riscv_vslide1up(p36, row2[3], vl); - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); - p12 = __riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); - p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], 
vl); - p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - - auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m1x4_t dst{}; - vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); - vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); - vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); - vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); - vop(p04, p02); vop(p06, p04); vop(p04, p02); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); - vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); - vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); - vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); - vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); - vop(p14, p12); vop(p16, p14); vop(p14, p12); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); - vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); - vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); - vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); - vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); - vop(p24, p22); vop(p26, p24); vop(p24, p22); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); - vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); - vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); - vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); - vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); - vop(p34, p32); vop(p36, p34); vop(p34, p32); - dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - else - { - vl = __riscv_vsetvl_e8m2(width - j); - vuint8m2_t p00, p01, p02, p03, p04; - vuint8m2_t p05, p06, p07, p08, p09; - vuint8m2_t p010, p011, p012, p013, p014; - vuint8m2_t p015, p016, p017, p018, p019; - vuint8m2_t p020, p021, p022, p023, p024; - vuint8m2_t p10, p11, p12, p13, p14; - vuint8m2_t p15, p16, p17, p18, p19; - vuint8m2_t p110, p111, p112, p113, p114; - vuint8m2_t p115, p116, p117, p118, p119; - vuint8m2_t p120, p121, p122, p123, p124; - vuint8m2_t p20, p21, p22, p23, p24; - vuint8m2_t p25, p26, p27, p28, p29; - vuint8m2_t p210, p211, p212, p213, p214; - vuint8m2_t p215, p216, p217, p218, p219; - vuint8m2_t p220, p221, p222, p223, p224; - vuint8m2_t p30, p31, p32, p33, p34; - vuint8m2_t p35, p36, p37, p38, p39; - vuint8m2_t p310, p311, p312, p313, p314; - vuint8m2_t p315, p316, p317, p318, p319; - vuint8m2_t p320, p321, p322, p323, p324; - auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { - auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); - p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); - p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); - p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); - p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); - }; - if (j >= 2) - { - loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); - loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); - loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); - loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); - loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); - } - else - { - loadsrc(row0, p00, p10, p20, p30); - loadsrc(row1, p05, p15, p25, p35); - loadsrc(row2, p010, p110, p210, p310); - loadsrc(row3, p015, p115, p215, p315); - loadsrc(row4, p020, p120, p220, p320); - auto slideup = [&] { - p00 = __riscv_vslide1up(p00, row0[0], vl); - p10 = __riscv_vslide1up(p10, row0[1], vl); - p20 = __riscv_vslide1up(p20, row0[2], vl); - p30 = 
__riscv_vslide1up(p30, row0[3], vl); - p05 = __riscv_vslide1up(p05, row1[0], vl); - p15 = __riscv_vslide1up(p15, row1[1], vl); - p25 = __riscv_vslide1up(p25, row1[2], vl); - p35 = __riscv_vslide1up(p35, row1[3], vl); - p010 = __riscv_vslide1up(p010, row2[0], vl); - p110 = __riscv_vslide1up(p110, row2[1], vl); - p210 = __riscv_vslide1up(p210, row2[2], vl); - p310 = __riscv_vslide1up(p310, row2[3], vl); - p015 = __riscv_vslide1up(p015, row3[0], vl); - p115 = __riscv_vslide1up(p115, row3[1], vl); - p215 = __riscv_vslide1up(p215, row3[2], vl); - p315 = __riscv_vslide1up(p315, row3[3], vl); - p020 = __riscv_vslide1up(p020, row4[0], vl); - p120 = __riscv_vslide1up(p120, row4[1], vl); - p220 = __riscv_vslide1up(p220, row4[2], vl); - p320 = __riscv_vslide1up(p320, row4[3], vl); - }; - slideup(); - if (j == 0) - { - slideup(); - } - } - p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); - p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); - p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); - p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); - p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); - p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); - p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); - p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); - p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); - p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); - p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); - p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); - p016 = __riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); - p116 = __riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); - p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); - p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); - p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); - p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); - p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); - p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); - p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); - p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); - p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); - p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); - p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); - p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); - p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); - p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); - p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); - p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); - p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); - p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); - p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); - p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); - p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); - p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); - p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); - p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); - p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); - p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); - p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, 
j + vl) * 4 ], vl); - p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + vl) * 4 + 1], vl); - p23 = __riscv_vslide1down(p22, row0[std::min(width - 1, j + vl) * 4 + 2], vl); - p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); - p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); - p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); - p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); - p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); - p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); - p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); - p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); - p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); - p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); - p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); - p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); - p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); - p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); - p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); - p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); - p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); - p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); - p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); - p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p29 = __riscv_vslide1down(p28, row1[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); - p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); - p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); - p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); - p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); - p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); - - auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { - auto t = a; - a = __riscv_vminu(a, b, vl); - b = __riscv_vmaxu(t, b, vl); - }; - vuint8m2x4_t dst{}; - vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); - vop(p04, p05); vop(p00, p03); vop(p02, 
p05); vop(p02, p03); vop(p01, p04); - vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); - vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011); - vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); - vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); - vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); - vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); - vop(p09, p010); vop(p013, p014); vop(p012, p013); vop(p013, p014); vop(p016, p017); - vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); - vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); - vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); - vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); - vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); - vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); - vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); - vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); - vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); - vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); - vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); - vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); - vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); - vop(p07, p011); vop(p011, p013); vop(p011, p012); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); - vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); - vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); - vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); - vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); - vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); - vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); - vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); - vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); - vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); - vop(p115, p116); vop(p116, p117); vop(p112, p115); vop(p114, p117); vop(p114, p115); - vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); - vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); - vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); - vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); - vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); - vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); - vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); - vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); - vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); - vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); - vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, 
p117); - vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); - vop(p17, p111); vop(p111, p113); vop(p111, p112); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112); - vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); - vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); - vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); - vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); - vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); - vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); - vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); - vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); - vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); - vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, p217); vop(p214, p215); - vop(p213, p216); vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); - vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); - vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); - vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); - vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); - vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); - vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); - vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); - vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); - vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); - vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); - vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); - vop(p27, p211); vop(p211, p213); vop(p211, p212); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); - vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); - vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); - vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); - vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); - vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); - vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); - vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); - vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); - vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); - vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); - vop(p313, p316); vop(p313, p314); vop(p315, p316); vop(p319, p320); vop(p318, p319); - vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); - vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); - vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); - vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); - vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); - vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); - vop(p317, p318); 
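The vop lambda defined earlier in this hunk is a vectorized compare-exchange (vminu/vmaxu on whole registers), and the long vop(...) chains are a fixed sorting network that selects the median of the 25 samples of a 5x5 window, one full network per channel; after the network has run, the median sits in p012/p112/p212/p312, which is what gets packed into dst. A minimal scalar sketch of the same idea (illustrative only; the name cex is hypothetical, not from this patch):

    #include <algorithm>
    #include <cstdint>

    // Compare-exchange: afterwards a holds the smaller and b the larger value,
    // mirroring what vop() does with vminu/vmaxu on whole vectors of pixels.
    static inline void cex(uint8_t& a, uint8_t& b) { if (a > b) std::swap(a, b); }

    // Applying the same fixed sequence of exchanges as the vop() calls in this hunk
    // to a plain array p[0..24] of window samples leaves the median in p[12];
    // the RVV code simply performs every exchange on a vector of pixels at once.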
vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); - vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); - vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310); - vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); - vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); - vop(p37, p311); vop(p311, p313); vop(p311, p312); - dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); - __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if ((ksize != 3 && ksize != 5) || src_data == dst_data) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - switch (ksize*100 + type) - { - case 300 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_16SC1: - return filter::invoke(height, {medianBlurC1<3, RVV_I16M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 300 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16UC1: - return filter::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_16SC1: - return filter::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_32FC1: - return filter::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); - - case 300 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); - case 500 + CV_8UC4: - return filter::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::medianBlur - -namespace boxFilter { -#undef cv_hal_boxFilter -#define cv_hal_boxFilter cv::cv_hal_rvv::boxFilter::boxFilter - -template struct rvv; -template<> struct rvv -{ - static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } - static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } - static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } - static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } - static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { 
return a; } - static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } -}; -template<> struct rvv -{ - static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } - static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } -}; - -// the algorithm is same as cv_hal_sepFilter -template -static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - using T = typename helperT::ElemType; - using WT = typename helperWT::ElemType; - - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; - - std::vector res(width * ksize); - auto process = [&](int x, int y) { - WT sum = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum += reinterpret_cast(src_data + x * src_step)[p]; - } - } - res[p2idx(x, y)] = sum; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = helperT::setvl(right - j); - const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; - auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); - - extra += vl; - auto sum = src; - src = helperWT::vslide1down(src, extra[0], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[1], vl); - sum = helperWT::vadd(sum, src, vl); - if (ksize == 5) - { - src = helperWT::vslide1down(src, extra[2], vl); - sum = helperWT::vadd(sum, src, vl); - src = helperWT::vslide1down(src, extra[3], vl); - sum = helperWT::vadd(sum, src, vl); - } - helperWT::vstore(res.data() + p2idx(i, j), sum, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const WT* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = helperWT::setvl(width - j); - auto sum = row0 ? 
helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); - if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); - if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); - if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); - if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); - if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); - - if (cast) - { - helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); - } - else - { - helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); - } - } - } - } - - return CV_HAL_ERROR_OK; -} - -template -static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - constexpr int noval = std::numeric_limits::max(); - auto accessX = [&](int x) { - int pi = filter::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); - return pi < 0 ? noval : pi - offset_y; - }; - auto accessY = [&](int y) { - int pj = filter::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); - return pj < 0 ? noval : pj - offset_x; - }; - auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; - - std::vector res(width * ksize * 3); - auto process = [&](int x, int y) { - float sum0, sum1, sum2; - sum0 = sum1 = sum2 = 0; - for (int i = 0; i < ksize; i++) - { - int p = accessY(y + i); - if (p != noval) - { - sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; - sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; - sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; - } - } - res[p2idx(x, y) ] = sum0; - res[p2idx(x, y) + 1] = sum1; - res[p2idx(x, y) + 2] = sum2; - }; - - const int left = anchor_x, right = width - (ksize - 1 - anchor_x); - for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) - { - if (i + offset_y >= 0 && i + offset_y < full_height) - { - if (left >= right) - { - for (int j = 0; j < width; j++) - process(i, j); - } - else - { - for (int j = 0; j < left; j++) - process(i, j); - for (int j = right; j < width; j++) - process(i, j); - - int vl; - for (int j = left; j < right; j += vl) - { - vl = __riscv_vsetvl_e32m2(right - j); - const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; - auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - extra += vl * 3; - auto sum0 = src0, sum1 = src1, sum2 = src2; - src0 = __riscv_vfslide1down(src0, extra[0], vl); - src1 = __riscv_vfslide1down(src1, extra[1], vl); - src2 = __riscv_vfslide1down(src2, extra[2], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[3], vl); - src1 = __riscv_vfslide1down(src1, extra[4], vl); - src2 = __riscv_vfslide1down(src2, extra[5], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - if (ksize == 5) - { - src0 = __riscv_vfslide1down(src0, extra[6], vl); - src1 = __riscv_vfslide1down(src1, extra[7], vl); - src2 = __riscv_vfslide1down(src2, extra[8], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, 
src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - src0 = __riscv_vfslide1down(src0, extra[ 9], vl); - src1 = __riscv_vfslide1down(src1, extra[10], vl); - src2 = __riscv_vfslide1down(src2, extra[11], vl); - sum0 = __riscv_vfadd(sum0, src0, vl); - sum1 = __riscv_vfadd(sum1, src1, vl); - sum2 = __riscv_vfadd(sum2, src2, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); - } - } - } - - int cur = i - (ksize - 1 - anchor_y); - if (cur >= start) - { - const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); - const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); - const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); - const float* row3 = nullptr, *row4 = nullptr; - if (ksize == 5) - { - row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); - row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - vfloat32m2_t sum0, sum1, sum2; - sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); - auto loadres = [&](const float* row) { - if (!row) return; - auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); - sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); - sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); - sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); - }; - loadres(row0); - loadres(row1); - loadres(row2); - loadres(row3); - loadres(row4); - if (normalize) - { - sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); - sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); - sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); - } - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); - __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); - } - } - } - - return CV_HAL_ERROR_OK; -} - -inline int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) -{ - const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); - if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - uchar* _dst_data = dst_data; - size_t _dst_step = dst_step; - const size_t size = CV_ELEM_SIZE(dst_type); - std::vector dst; - if (src_data == _dst_data) - { - dst = std::vector(width * height * size); - dst_data = dst.data(); - dst_step = width * size; - } - - int res = CV_HAL_ERROR_NOT_IMPLEMENTED; - anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; - anchor_y = anchor_y < 0 ? 
ksize_height / 2 : anchor_y; - if (src_type != dst_type) - { - if (src_type == CV_8UC1 && dst_type == CV_16UC1) - { - if (ksize_width == 3) - { - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - if (ksize_width == 5) - { - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - } - } - } - else - { - switch (ksize_width*100 + src_type) - { - case 300 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_8UC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_16SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32SC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC1: - res = filter::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 300 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - case 500 + CV_32FC3: - res = filter::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, 
margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); - break; - } - } - if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - if (src_data == _dst_data) - { - for (int i = 0; i < height; i++) - memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); - } - - return res; -} -} // cv::cv_hal_rvv::boxFilter - -namespace bilateralFilter { -#undef cv_hal_bilateralFilter -#define cv_hal_bilateralFilter cv::cv_hal_rvv::bilateralFilter::bilateralFilter - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_8u_Invoker -static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vle8_v_u8m2(sptr + j, vl); - auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); - auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto dst = __riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const uchar* sptr = src_data + (i+radius) * src_step + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const uchar* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; 
j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); - auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); - auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); - - auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); - auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); - auto diff2 = __riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); - auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); - vuint8m2x3_t dst{}; - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); - dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); - __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp -// in the functor BilateralFilter_32f_Invoker -static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr int align = 31; - std::vector _sum(width + align), _wsum(width + align); - float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; - memset(sum, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); - auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, 
__riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); - __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m4(width - j); - auto src = __riscv_vle32_v_f32m4(sptr + j, vl); - auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); - __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) -{ - constexpr int align = 31; - std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); - float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); - float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); - float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); - float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); - - for (int i = start; i < end; i++) - { - const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; - memset(sum_b, 0, sizeof(float) * width); - memset(sum_g, 0, sizeof(float) * width); - memset(sum_r, 0, sizeof(float) * width); - memset(wsum, 0, sizeof(float) * width); - for(int k = 0; k < maxk; k++) - { - const float* ksptr = sptr + space_ofs[k]; - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); - auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); - auto idx = __riscv_vfcvt_rtz_x(diff, vl); - auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); - - auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); - auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); - w = __riscv_vfmul(w, space_weight[k], vl); - - __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); - __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); - __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + 
j, vl), vl), vl); - __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); - } - } - - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e32m2(width - j); - auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); - auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); - auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); - auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); - auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); - - vfloat32m2x3_t dst{}; - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); - dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); - __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); - } - } - - return CV_HAL_ERROR_OK; -} - -// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp -// in the function static void bilateralFilter_8u and bilateralFilter_32f -inline int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) -{ - const int type = CV_MAKETYPE(depth, cn); - if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && type != CV_32FC3) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (type == CV_32FC1 && width * height > 1 << 20) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - if (src_data == dst_data || border_type & BORDER_ISOLATED) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - sigma_color = sigma_color <= 0 ? 1 : sigma_color; - sigma_space = sigma_space <= 0 ? 1 : sigma_space; - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; - radius = std::max(radius, 1); - d = radius*2 + 1; - - const int size = depth == CV_32F ? 
cn * sizeof(float) : cn; - const int temp_step = (width + radius * 2) * size; - std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); - uchar* temp = _temp.data(); - std::vector width_interpolate(radius * 2); - for (int j = 0; j < radius; j++) - { - width_interpolate[j] = filter::borderInterpolate(j - radius, width, border_type); - width_interpolate[j + radius] = filter::borderInterpolate(width + j, width, border_type); - } - for (int i = 0; i < height + radius * 2; i++) - { - int x = filter::borderInterpolate(i - radius, height, border_type); - if (x != -1) - { - for (int j = 0; j < radius; j++) - { - int y = width_interpolate[j]; - if (y != -1) - memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); - y = width_interpolate[j + radius]; - if (y != -1) - memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); - } - memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); - } - } - - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = _space_weight.data(); - int* space_ofs = _space_ofs.data(); - int maxk = 0; - for (int i = -radius; i <= radius; i++) - { - for (int j = -radius; j <= radius; j++) - { - double r = std::sqrt((double)i*i + (double)j*j); - if (r <= radius && (depth == CV_8U || i != 0 || j != 0)) - { - space_weight[maxk] = static_cast(r*r*gauss_space_coeff); - space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; - } - } - } - cv::cv_hal_rvv::exp32f(space_weight, space_weight, maxk); - - if (depth == CV_8U) - { - std::vector _color_weight(cn*256); - float* color_weight = _color_weight.data(); - for (int i = 0; i < 256*cn; i++) - color_weight[i] = static_cast(i*i*gauss_color_coeff); - cv::cv_hal_rvv::exp32f(color_weight, color_weight, 256*cn); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - case 3: - return filter::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); - } - } - else - { - double minValSrc = -1, maxValSrc = 1; - cv::cv_hal_rvv::minmax::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - for (int i = 0; i < width; i++) - memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); - return CV_HAL_ERROR_OK; - } - - const int kExpNumBinsPerChannel = 1 << 12; - const int kExpNumBins = kExpNumBinsPerChannel * cn; - const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); - std::vector _expLUT(kExpNumBins+2, 0); - float* expLUT = _expLUT.data(); - for (int i = 0; i < kExpNumBins+2; i++) - { - double val = i / scale_index; - expLUT[i] = static_cast(val * val * gauss_color_coeff); - } - cv::cv_hal_rvv::exp32f(expLUT, expLUT, kExpNumBins+2); - - switch (cn) - { - case 1: - return filter::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); - case 3: - return filter::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); - } - } - - return CV_HAL_ERROR_NOT_IMPLEMENTED; -} -} // cv::cv_hal_rvv::bilateralFilter - -}} - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp 
b/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp deleted file mode 100644 index 48f6123b0d..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/histogram.hpp +++ /dev/null @@ -1,108 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED -#define OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { - -namespace equalize_hist { -#undef cv_hal_equalize_hist -#define cv_hal_equalize_hist cv::cv_hal_rvv::equalize_hist::equalize_hist - -class HistogramInvoker : public ParallelLoopBody -{ -public: - template - HistogramInvoker(std::function _func, Args&&... args) - { - func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); - } - - virtual void operator()(const Range& range) const override - { - func(range.start, range.end); - } - -private: - std::function func; -}; - -constexpr int HIST_SZ = std::numeric_limits::max() + 1; - -static inline void hist_invoke(int start, int end, const uchar* src_data, size_t src_step, int width, int* hist, std::mutex* m) -{ - int h[HIST_SZ] = {0}; - for (int i = start; i < end; i++) - { - const uchar* src = src_data + i * src_step; - int j; - for (j = 0; j + 3 < width; j += 4) - { - int t0 = src[j], t1 = src[j+1]; - h[t0]++; h[t1]++; - t0 = src[j+2]; t1 = src[j+3]; - h[t0]++; h[t1]++; - } - for (; j < width; j++) - { - h[src[j]]++; - } - } - - std::lock_guard lk(*m); - for (int i = 0; i < HIST_SZ; i++) - { - hist[i] += h[i]; - } -} - -static inline void lut_invoke(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, const uchar* lut) -{ - for (int i = start; i < end; i++) - { - int vl; - for (int j = 0; j < width; j += vl) - { - vl = __riscv_vsetvl_e8m8(width - j); - auto src = __riscv_vle8_v_u8m8(src_data + i * src_step + j, vl); - auto dst = __riscv_vloxei8_v_u8m8(lut, src, vl); - __riscv_vse8(dst_data + i * dst_step + j, dst, vl); - } - } -} - -// the algorithm is copied from imgproc/src/histogram.cpp, -// in the function void cv::equalizeHist -inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) -{ - int hist[HIST_SZ] = {0}; - uchar lut[HIST_SZ]; - - std::mutex m; - cv::parallel_for_(Range(0, height), HistogramInvoker({hist_invoke}, src_data, src_step, width, reinterpret_cast(hist), &m), static_cast(width * height) / (1 << 15)); - - int i = 0; - while (!hist[i]) ++i; - - float scale = (HIST_SZ - 1.f)/(width * height - hist[i]); - int sum = 0; - for (lut[i++] = 0; i < HIST_SZ; i++) - { - sum += hist[i]; - lut[i] = std::min(std::max(static_cast(std::round(sum * scale)), 0), HIST_SZ - 1); - } - cv::parallel_for_(Range(0, height), HistogramInvoker({lut_invoke}, src_data, src_step, dst_data, dst_step, width, reinterpret_cast(lut)), static_cast(width * height) / (1 << 15)); - - return CV_HAL_ERROR_OK; -} -} // cv::cv_hal_rvv::equalize_hist - -}} - -#endif diff --git a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp b/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp deleted file mode 100644 index feab2047e5..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/polar_to_cart.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// This file is part of OpenCV project. 
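The equalize_hist implementation deleted above builds a 256-bin histogram in parallel worker ranges, turns its cumulative sum into a lookup table, and then remaps every pixel through that table. A scalar sketch of the LUT construction step (a minimal restatement of the deleted code, assuming the same clamping behavior):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // hist[256] is the image histogram, total the number of pixels.
    static void buildEqualizeLut(const int hist[256], long total, uint8_t lut[256])
    {
        int i = 0;
        while (!hist[i]) ++i;                        // first populated bin maps to 0
        float scale = 255.f / (total - hist[i]);
        int sum = 0;
        lut[i++] = 0;
        for (; i < 256; i++)
        {
            sum += hist[i];
            lut[i] = (uint8_t)std::min(std::max((int)std::round(sum * scale), 0), 255);
        }
        // Every output pixel is then dst = lut[src]; the deleted lut_invoke() does the
        // remap with vloxei8 gather loads instead of a scalar loop.
    }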
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED -#define OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED - -#include -#include "hal_rvv_1p0/sincos.hpp" -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_polarToCart32f -#define cv_hal_polarToCart32f cv::cv_hal_rvv::polarToCart -#undef cv_hal_polarToCart64f -#define cv_hal_polarToCart64f cv::cv_hal_rvv::polarToCart - -template -inline int - polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) -{ - using T = RVV_F32M4; - const auto sincos_scale = angleInDegrees ? detail::sincos_deg_scale : detail::sincos_rad_scale; - - size_t vl; - auto cos_p2 = T::vmv(detail::sincos_cos_p2, T::setvlmax()); - auto cos_p0 = T::vmv(detail::sincos_cos_p0, T::setvlmax()); - for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) - { - vl = RVV_T::setvl(len); - auto vangle = T::cast(RVV_T::vload(angle, vl), vl); - T::VecType vsin, vcos; - detail::SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); - if (mag) - { - auto vmag = T::cast(RVV_T::vload(mag, vl), vl); - vsin = __riscv_vfmul(vsin, vmag, vl); - vcos = __riscv_vfmul(vcos, vmag, vl); - mag += vl; - } - RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); - RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp b/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp deleted file mode 100644 index b87998d637..0000000000 --- a/hal/riscv-rvv/hal_rvv_1p0/sqrt.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED - -#include -#include -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_sqrt32f -#undef cv_hal_sqrt64f -#undef cv_hal_invSqrt32f -#undef cv_hal_invSqrt64f - -#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt> -#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt> - -#ifdef __clang__ -// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. -// So a smaller LMUL is used here. -# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#else -# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> -# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> -#endif - -namespace detail { - -// Newton-Raphson method -// Use 4 LMUL registers -template -inline VEC_T sqrt(VEC_T x, size_t vl) -{ - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul(t, y, vl); - } - // just to prevent the compiler from calculating mask before the iteration, which will run out - // of registers and cause memory access. 
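The iteration above is a Newton-Raphson refinement of the low-precision reciprocal-square-root estimate returned by vfrsqrt7: each step computes y = y * (1.5 - 0.5 * x * y * y), and the final masked multiply turns 1/sqrt(x) back into sqrt(x) for well-behaved inputs. A scalar sketch of the same refinement (illustrative; the bit-trick seed merely stands in for vfrsqrt7, and the function name is hypothetical):

    #include <cstdint>
    #include <cstring>

    static inline float rsqrt_newton(float x, int iter_times)  // 2-3 iterations suffice for float/double
    {
        // Low-precision seed, playing the role of __riscv_vfrsqrt7.
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        bits = 0x5f3759dfu - (bits >> 1);
        float y;
        std::memcpy(&y, &bits, sizeof(y));

        float x2 = 0.5f * x;
        for (int i = 0; i < iter_times; i++)
            y = y * (1.5f - x2 * y * y);   // Newton step for f(y) = 1/y^2 - x
        return y;                          // invSqrt result; sqrt(x) is then x * y
    }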
- asm volatile("" ::: "memory"); - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - return __riscv_vfmul_mu(mask, x, x, y, vl); -} - -// Newton-Raphson method -// Use 3 LMUL registers and 1 mask register -template -inline VEC_T invSqrt(VEC_T x, size_t vl) -{ - auto classified = __riscv_vfclass(x, vl); - // block -0, +0, positive subnormal number, +inf - auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); - auto x2 = __riscv_vfmul(x, 0.5, vl); - auto y = __riscv_vfrsqrt7(x, vl); -#ifdef __clang__ -#pragma unroll -#endif - for (size_t i = 0; i < iter_times; i++) - { - auto t = __riscv_vfmul(y, y, vl); - t = __riscv_vfmul(t, x2, vl); - t = __riscv_vfrsub(t, 1.5, vl); - y = __riscv_vfmul_mu(mask, y, t, y, vl); - } - return y; -} - -} // namespace detail - -template -struct Sqrt32f -{ - using T = RVV_T; - static constexpr size_t iter_times = 2; -}; - -template -struct Sqrt64f -{ - using T = RVV_T; - static constexpr size_t iter_times = 3; -}; - -template -inline int sqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::sqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -template -inline int invSqrt(const Elem* src, Elem* dst, int _len) -{ - size_t vl; - for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) - { - vl = SQRT_T::T::setvl(len); - auto x = SQRT_T::T::vload(src, vl); - SQRT_T::T::vstore(dst, detail::invSqrt(x, vl), vl); - } - - return CV_HAL_ERROR_OK; -} - -}} // namespace cv::cv_hal_rvv - -#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED diff --git a/hal/riscv-rvv/include/core.hpp b/hal/riscv-rvv/include/core.hpp new file mode 100644 index 0000000000..b800420d42 --- /dev/null +++ b/hal/riscv-rvv/include/core.hpp @@ -0,0 +1,332 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
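The declarations that follow in this new header pair each plain function in cv::rvv_hal::core with an #undef/#define of the corresponding cv_hal_* macro. That is how OpenCV's custom-HAL hook works: the library always calls the macro, which by default expands to a stub returning CV_HAL_ERROR_NOT_IMPLEMENTED, and including this header makes the same macro resolve to the RVV implementation instead. A hedged sketch of the resulting call site (builtin_sqrt32f is a placeholder for the scalar fallback, not a name from this patch; the cv_hal_sqrt32f macro and error codes come from the OpenCV HAL headers):

    // Conceptual dispatch inside OpenCV core once this header is the active HAL.
    static int dispatch_sqrt32f(const float* src, float* dst, int len)
    {
        int ret = cv_hal_sqrt32f(src, dst, len);     // resolves to cv::rvv_hal::core::sqrt32f
        if (ret == CV_HAL_ERROR_NOT_IMPLEMENTED)     // a HAL may decline; fall back to plain C++
            ret = builtin_sqrt32f(src, dst, len);    // hypothetical built-in implementation
        return ret;
    }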
+ +#ifndef OPENCV_RVV_HAL_CORE_HPP +#define OPENCV_RVV_HAL_CORE_HPP + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ merge ############ */ + +int merge8u(const uchar** src, uchar* dst, int len, int cn); +int merge16u(const ushort** src, ushort* dst, int len, int cn); +int merge32s(const int** src, int* dst, int len, int cn); +int merge64s(const int64** src, int64* dst, int len, int cn); + +#undef cv_hal_merge8u +#define cv_hal_merge8u cv::rvv_hal::core::merge8u +#undef cv_hal_merge16u +#define cv_hal_merge16u cv::rvv_hal::core::merge16u +#undef cv_hal_merge32s +#define cv_hal_merge32s cv::rvv_hal::core::merge32s +#undef cv_hal_merge64s +#define cv_hal_merge64s cv::rvv_hal::core::merge64s + +/* ############ meanStdDev ############ */ + +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); + +#undef cv_hal_meanStdDev +#define cv_hal_meanStdDev cv::rvv_hal::core::meanStdDev + +/* ############ dft ############ */ + +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute); + +#undef cv_hal_dft +#define cv_hal_dft cv::rvv_hal::core::dft + +/* ############ norm ############ */ + +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result); + +#undef cv_hal_norm +#define cv_hal_norm cv::rvv_hal::core::norm + +/* ############ normDiff ############ */ + +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, + const uchar* mask, size_t mask_step, int width, int height, int type, + int norm_type, double* result); + +#undef cv_hal_normDiff +#define cv_hal_normDiff cv::rvv_hal::core::normDiff + +/* ############ normHamming ############ */ + +int normHamming8u(const uchar* a, int n, int cellSize, int* result); +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result); + +#undef cv_hal_normHamming8u +#define cv_hal_normHamming8u cv::rvv_hal::core::normHamming8u +#undef cv_hal_normHammingDiff8u +#define cv_hal_normHammingDiff8u cv::rvv_hal::core::normHammingDiff8u + +/* ############ convertScale ############ */ + +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta); + +#undef cv_hal_convertScale +#define cv_hal_convertScale cv::rvv_hal::core::convertScale + +/* ############ minMaxIdx ############ */ + +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0); + +#undef cv_hal_minMaxIdx +#define cv_hal_minMaxIdx cv::rvv_hal::core::minMaxIdx +#undef cv_hal_minMaxIdxMaskStep +#define cv_hal_minMaxIdxMaskStep cv::rvv_hal::core::minMaxIdx + +/* ############ fastAtan ############ */ + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg); +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg); + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f cv::rvv_hal::core::fast_atan_32 +#undef cv_hal_fastAtan64f +#define cv_hal_fastAtan64f cv::rvv_hal::core::fast_atan_64 + +/* ############ split ############ */ + +int split8u(const uchar* src, uchar** dst, int len, int cn); + +#undef cv_hal_split8u 
+#define cv_hal_split8u cv::rvv_hal::core::split8u
+
+/* ############ sqrt ############ */
+
+int sqrt32f(const float* src, float* dst, int _len);
+int sqrt64f(const double* src, double* dst, int _len);
+
+#undef cv_hal_sqrt32f
+#define cv_hal_sqrt32f cv::rvv_hal::core::sqrt32f
+#undef cv_hal_sqrt64f
+#define cv_hal_sqrt64f cv::rvv_hal::core::sqrt64f
+
+int invSqrt32f(const float* src, float* dst, int _len);
+int invSqrt64f(const double* src, double* dst, int _len);
+
+#undef cv_hal_invSqrt32f
+#define cv_hal_invSqrt32f cv::rvv_hal::core::invSqrt32f
+#undef cv_hal_invSqrt64f
+#define cv_hal_invSqrt64f cv::rvv_hal::core::invSqrt64f
+
+/* ############ magnitude ############ */
+
+int magnitude32f(const float *x, const float *y, float *dst, int len);
+int magnitude64f(const double *x, const double *y, double *dst, int len);
+
+#undef cv_hal_magnitude32f
+#define cv_hal_magnitude32f cv::rvv_hal::core::magnitude32f
+#undef cv_hal_magnitude64f
+#define cv_hal_magnitude64f cv::rvv_hal::core::magnitude64f
+
+/* ############ cartToPolar ############ */
+
+int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees);
+int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees);
+
+#undef cv_hal_cartToPolar32f
+#define cv_hal_cartToPolar32f cv::rvv_hal::core::cartToPolar32f
+#undef cv_hal_cartToPolar64f
+#define cv_hal_cartToPolar64f cv::rvv_hal::core::cartToPolar64f
+
+/* ############ polarToCart ############ */
+
+int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees);
+int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees);
+
+#undef cv_hal_polarToCart32f
+#define cv_hal_polarToCart32f cv::rvv_hal::core::polarToCart32f
+#undef cv_hal_polarToCart64f
+#define cv_hal_polarToCart64f cv::rvv_hal::core::polarToCart64f
+
+/* ############ flip ############ */
+
+int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
+         uchar* dst_data, size_t dst_step, int flip_mode);
+
+#undef cv_hal_flip
+#define cv_hal_flip cv::rvv_hal::core::flip
+
+/* ############ lut ############ */
+
+int lut(const uchar* src_data, size_t src_step, size_t src_type,
+        const uchar* lut_data, size_t lut_channel_size, size_t lut_channels,
+        uchar* dst_data, size_t dst_step, int width, int height);
+
+#undef cv_hal_lut
+#define cv_hal_lut cv::rvv_hal::core::lut
+
+/* ############ exp ############ */
+
+int exp32f(const float* src, float* dst, int _len);
+int exp64f(const double* src, double* dst, int _len);
+
+#undef cv_hal_exp32f
+#define cv_hal_exp32f cv::rvv_hal::core::exp32f
+#undef cv_hal_exp64f
+#define cv_hal_exp64f cv::rvv_hal::core::exp64f
+
+/* ############ log ############ */
+
+int log32f(const float* src, float* dst, int _len);
+int log64f(const double* src, double* dst, int _len);
+
+#undef cv_hal_log32f
+#define cv_hal_log32f cv::rvv_hal::core::log32f
+#undef cv_hal_log64f
+#define cv_hal_log64f cv::rvv_hal::core::log64f
+
+/* ############ lu ############ */
+
+int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info);
+int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info);
+
+#undef cv_hal_LU32f
+#define cv_hal_LU32f cv::rvv_hal::core::LU32f
+#undef cv_hal_LU64f
+#define cv_hal_LU64f cv::rvv_hal::core::LU64f
+
+/* ############ cholesky ############ */
+
+int Cholesky32f(float* src1, size_t
src1_step, int m, float* src2, size_t src2_step, int n, bool* info); +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info); + +#undef cv_hal_Cholesky32f +#define cv_hal_Cholesky32f cv::rvv_hal::core::Cholesky32f +#undef cv_hal_Cholesky64f +#define cv_hal_Cholesky64f cv::rvv_hal::core::Cholesky64f + +/* ############ qr ############ */ + +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info); +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info); + +#undef cv_hal_QR32f +#define cv_hal_QR32f cv::rvv_hal::core::QR32f +#undef cv_hal_QR64f +#define cv_hal_QR64f cv::rvv_hal::core::QR64f + +/* ############ SVD ############ */ + +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags); +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags); + +#undef cv_hal_SVD32f +#define cv_hal_SVD32f cv::rvv_hal::core::SVD32f +#undef cv_hal_SVD64f +#define cv_hal_SVD64f cv::rvv_hal::core::SVD64f + +/* ############ copyToMasked ############ */ + +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type); + +#undef cv_hal_copyToMasked +#define cv_hal_copyToMasked cv::rvv_hal::core::copyToMasked + +/* ############ div, recip ############ */ + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale); +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale); +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int div64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_div8u +#define cv_hal_div8u cv::rvv_hal::core::div8u +#undef cv_hal_div8s +#define cv_hal_div8s cv::rvv_hal::core::div8s +#undef cv_hal_div16u +#define cv_hal_div16u cv::rvv_hal::core::div16u +#undef cv_hal_div16s +#define cv_hal_div16s cv::rvv_hal::core::div16s +#undef cv_hal_div32s +#define cv_hal_div32s cv::rvv_hal::core::div32s +#undef cv_hal_div32f +#define cv_hal_div32f cv::rvv_hal::core::div32f +// #undef cv_hal_div64f +// #define cv_hal_div64f cv::rvv_hal::core::div64f + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale); +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale); +int recip16u(const ushort *src_data, 
size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale); +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale); +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale); +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale); +// int recip64f(const double *src_data, size_t src_step, double *dst_data, size_t dst_step, int width, int height, double scale); + +#undef cv_hal_recip8u +#define cv_hal_recip8u cv::rvv_hal::core::recip8u +#undef cv_hal_recip8s +#define cv_hal_recip8s cv::rvv_hal::core::recip8s +#undef cv_hal_recip16u +#define cv_hal_recip16u cv::rvv_hal::core::recip16u +#undef cv_hal_recip16s +#define cv_hal_recip16s cv::rvv_hal::core::recip16s +#undef cv_hal_recip32s +#define cv_hal_recip32s cv::rvv_hal::core::recip32s +#undef cv_hal_recip32f +#define cv_hal_recip32f cv::rvv_hal::core::recip32f +// #undef cv_hal_recip64f +// #define cv_hal_recip64f cv::rvv_hal::core::recip64f + +/* ############ dotProduct ############ */ + +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val); + +#undef cv_hal_dotProduct +#define cv_hal_dotProduct cv::rvv_hal::core::dotprod + +/* ############ compare ############ */ + +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); +// int cmp64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation); + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u cv::rvv_hal::core::cmp8u +#undef cv_hal_cmp8s +#define cv_hal_cmp8s cv::rvv_hal::core::cmp8s +#undef cv_hal_cmp16u +#define cv_hal_cmp16u cv::rvv_hal::core::cmp16u +#undef cv_hal_cmp16s +#define cv_hal_cmp16s cv::rvv_hal::core::cmp16s +#undef cv_hal_cmp32s +#define cv_hal_cmp32s cv::rvv_hal::core::cmp32s +#undef cv_hal_cmp32f +#define cv_hal_cmp32f cv::rvv_hal::core::cmp32f +// #undef cv_hal_cmp64f +// #define cv_hal_cmp64f cv::rvv_hal::core::cmp64f + +/* ############ transpose2d ############ */ + +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size); + +#undef cv_hal_transpose2d +#define cv_hal_transpose2d cv::rvv_hal::core::transpose2d + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core + +#endif // OPENCV_RVV_HAL_CORE_HPP diff --git 
a/hal/riscv-rvv/include/imgproc.hpp b/hal/riscv-rvv/include/imgproc.hpp new file mode 100644 index 0000000000..17c414dd9b --- /dev/null +++ b/hal/riscv-rvv/include/imgproc.hpp @@ -0,0 +1,256 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_RVV_HAL_IMGPROC_HPP +#define OPENCV_RVV_HAL_IMGPROC_HPP + +struct cvhalFilter2D; + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +/* ############ imageMoments ############ */ + +int imageMoments(const uchar* src_data, size_t src_step, int src_type, + int width, int height, bool binary, double m[10]); + +#undef cv_hal_imageMoments +#define cv_hal_imageMoments cv::rvv_hal::imgproc::imageMoments + +/* ############ filter ############ */ + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int filterFree(cvhalFilter2D* context); + +#undef cv_hal_filterInit +#define cv_hal_filterInit cv::rvv_hal::imgproc::filterInit +#undef cv_hal_filter +#define cv_hal_filter cv::rvv_hal::imgproc::filter +#undef cv_hal_filterFree +#define cv_hal_filterFree cv::rvv_hal::imgproc::filterFree + +/* ############ sepFilter ############ */ + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType); +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y); +int sepFilterFree(cvhalFilter2D* context); + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit cv::rvv_hal::imgproc::sepFilterInit +#undef cv_hal_sepFilter +#define cv_hal_sepFilter cv::rvv_hal::imgproc::sepFilter +#undef cv_hal_sepFilterFree +#define cv_hal_sepFilterFree cv::rvv_hal::imgproc::sepFilterFree + +/* ############ morph ############ */ + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/); +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/); +int morphFree(cvhalFilter2D* context); + +#undef cv_hal_morphInit +#undef cv_hal_morph +#undef cv_hal_morphFree +#define cv_hal_morphInit cv::rvv_hal::imgproc::morphInit +#define cv_hal_morph cv::rvv_hal::imgproc::morph +#define cv_hal_morphFree cv::rvv_hal::imgproc::morphFree + +/* ############ gaussianBlur ############ */ + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* 
dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type); + +#undef cv_hal_gaussianBlurBinomial +#define cv_hal_gaussianBlurBinomial cv::rvv_hal::imgproc::gaussianBlurBinomial + +/* ############ medianBlur ############ */ + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize); + +#undef cv_hal_medianBlur +#define cv_hal_medianBlur cv::rvv_hal::imgproc::medianBlur + +/* ############ boxFilter ############ */ + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type); + +#undef cv_hal_boxFilter +#define cv_hal_boxFilter cv::rvv_hal::imgproc::boxFilter + +/* ############ bilateralFilter ############ */ + +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, + double sigma_space, int border_type); + +#undef cv_hal_bilateralFilter +#define cv_hal_bilateralFilter cv::rvv_hal::imgproc::bilateralFilter + +/* ############ pyramid ############ */ + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type); + +#undef cv_hal_pyrdown +#define cv_hal_pyrdown cv::rvv_hal::imgproc::pyrDown +#undef cv_hal_pyrup +#define cv_hal_pyrup cv::rvv_hal::imgproc::pyrUp + +/* ############ cvtColor ############ */ + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn); +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits); +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits); +int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits); +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr); +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr); +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int 
yIdx); +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx); +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx); +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx); +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV); +int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue); +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue); +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb); +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb); + +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR +#undef cv_hal_cvtGraytoBGR +#define cv_hal_cvtGraytoBGR cv::rvv_hal::imgproc::cvtGraytoBGR +#undef cv_hal_cvtBGRtoGray +#define cv_hal_cvtBGRtoGray cv::rvv_hal::imgproc::cvtBGRtoGray +#undef cv_hal_cvtBGR5x5toBGR +#define cv_hal_cvtBGR5x5toBGR cv::rvv_hal::imgproc::cvtBGR5x5toBGR +#undef cv_hal_cvtBGRtoBGR5x5 +#define cv_hal_cvtBGRtoBGR5x5 cv::rvv_hal::imgproc::cvtBGRtoBGR5x5 +#undef cv_hal_cvtBGR5x5toGray +#define cv_hal_cvtBGR5x5toGray cv::rvv_hal::imgproc::cvtBGR5x5toGray +#undef cv_hal_cvtGraytoBGR5x5 +#define cv_hal_cvtGraytoBGR5x5 cv::rvv_hal::imgproc::cvtGraytoBGR5x5 +#undef cv_hal_cvtYUVtoBGR +#define cv_hal_cvtYUVtoBGR cv::rvv_hal::imgproc::cvtYUVtoBGR +#undef cv_hal_cvtBGRtoYUV +#define cv_hal_cvtBGRtoYUV cv::rvv_hal::imgproc::cvtBGRtoYUV +#undef cv_hal_cvtOnePlaneYUVtoBGR +#define cv_hal_cvtOnePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtOnePlaneYUVtoBGR +#undef cv_hal_cvtTwoPlaneYUVtoBGR +#define cv_hal_cvtTwoPlaneYUVtoBGR cv::rvv_hal::imgproc::cvtTwoPlaneYUVtoBGR +#undef cv_hal_cvtThreePlaneYUVtoBGR +#define cv_hal_cvtThreePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtThreePlaneYUVtoBGR +#undef cv_hal_cvtOnePlaneBGRtoYUV +#define cv_hal_cvtOnePlaneBGRtoYUV cv::rvv_hal::imgproc::cvtOnePlaneBGRtoYUV +#undef cv_hal_cvtBGRtoTwoPlaneYUV +#define cv_hal_cvtBGRtoTwoPlaneYUV cv::rvv_hal::imgproc::cvtBGRtoTwoPlaneYUV +#undef cv_hal_cvtBGRtoThreePlaneYUV +#define cv_hal_cvtBGRtoThreePlaneYUV cv::rvv_hal::imgproc::cvtBGRtoThreePlaneYUV +#undef cv_hal_cvtHSVtoBGR +#define cv_hal_cvtHSVtoBGR cv::rvv_hal::imgproc::cvtHSVtoBGR +#undef cv_hal_cvtBGRtoHSV +#define 
cv_hal_cvtBGRtoHSV cv::rvv_hal::imgproc::cvtBGRtoHSV +#undef cv_hal_cvtXYZtoBGR +#define cv_hal_cvtXYZtoBGR cv::rvv_hal::imgproc::cvtXYZtoBGR +#undef cv_hal_cvtBGRtoXYZ +#define cv_hal_cvtBGRtoXYZ cv::rvv_hal::imgproc::cvtBGRtoXYZ +#undef cv_hal_cvtLabtoBGR +#define cv_hal_cvtLabtoBGR cv::rvv_hal::imgproc::cvtLabtoBGR +#undef cv_hal_cvtBGRtoLab +#define cv_hal_cvtBGRtoLab cv::rvv_hal::imgproc::cvtBGRtoLab + +/* ############ warp ############ */ + +int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); +int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]); +int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]); + +// BUG: https://github.com/opencv/opencv/issues/27279 +// #undef cv_hal_remap32f +// #define cv_hal_remap32f cv::cv_hal_rvv::imgproc::remap32f +// #undef cv_hal_remap32fc2 +// #define cv_hal_remap32fc2 cv::cv_hal_rvv::imgproc::remap32fc2 +// #undef cv_hal_remap16s +// #define cv_hal_remap16s cv::cv_hal_rvv::imgproc::remap16s + +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]); +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]); + +// BUG: https://github.com/opencv/opencv/issues/27280 +//#undef cv_hal_warpAffine +//#define cv_hal_warpAffine cv::cv_hal_rvv::imgproc::warpAffine +//#undef cv_hal_warpPerspective +//#define cv_hal_warpPerspective cv::cv_hal_rvv::imgproc::warpPerspective + +/* ############ threshold ############ */ + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType); +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh); +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C); + +// disabled since UI is fast enough, only called in threshold_otsu +// #undef cv_hal_threshold +// #define cv_hal_threshold cv::rvv_hal::imgproc::threshold +#undef cv_hal_threshold_otsu +#define cv_hal_threshold_otsu cv::rvv_hal::imgproc::threshold_otsu +#undef cv_hal_adaptiveThreshold +#define cv_hal_adaptiveThreshold cv::rvv_hal::imgproc::adaptiveThreshold + +/* ############ histogram ############ */ + +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height); + 
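The declarations above are wired into OpenCV's generic HAL purely through the `#undef`/`#define` pairs: each `cv_hal_*` macro normally expands to a stub that reports `CV_HAL_ERROR_NOT_IMPLEMENTED`, and redefining it to a `cv::rvv_hal::*` function makes the library try the RVV kernel first and fall back only when it declines. A condensed, illustrative sketch of that mechanism (the `stub_`/`rvv_`/`_dispatch` names are mine, not OpenCV identifiers; only the error codes and the macro pattern come from the headers above):

```cpp
#include <cstddef>

#define CV_HAL_ERROR_OK              0
#define CV_HAL_ERROR_NOT_IMPLEMENTED 1

// Generic default installed by the library's HAL replacement header: a stub that declines.
inline int stub_equalize_hist(const unsigned char*, std::size_t, unsigned char*, std::size_t, int, int)
{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
#define cv_hal_equalize_hist stub_equalize_hist

// A custom HAL header (like the one above) is included afterwards and overrides the macro.
inline int rvv_equalize_hist(const unsigned char*, std::size_t, unsigned char*, std::size_t, int, int)
{ return CV_HAL_ERROR_OK; }   // stands in for cv::rvv_hal::imgproc::equalize_hist
#undef  cv_hal_equalize_hist
#define cv_hal_equalize_hist rvv_equalize_hist

// Call site inside the library: ask the HAL first, keep the portable path as fallback.
inline void equalizeHist_dispatch(const unsigned char* src, std::size_t sstep,
                                  unsigned char* dst, std::size_t dstep, int w, int h)
{
    if (cv_hal_equalize_hist(src, sstep, dst, dstep, w, h) == CV_HAL_ERROR_OK)
        return;   // handled by the accelerated HAL
    // ... portable reference implementation would run here ...
}
```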
+#undef cv_hal_equalize_hist +#define cv_hal_equalize_hist cv::rvv_hal::imgproc::equalize_hist + +int calc_hist(const uchar* src_data, size_t src_step, int src_type, int src_width, int src_height, float* hist_data, int hist_size, const float** ranges, bool uniform, bool accumulate); + +#undef cv_hal_calcHist +#define cv_hal_calcHist cv::rvv_hal::imgproc::calc_hist + +/* ############ resize ############ */ + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation); + +#undef cv_hal_resize +#define cv_hal_resize cv::rvv_hal::imgproc::resize + +/* ############ resize ############ */ + +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn); + +#undef cv_hal_integral +#define cv_hal_integral cv::rvv_hal::imgproc::integral + +#endif // CV_HAL_RVV_1P0_ENABLED + +#if CV_HAL_RVV_071_ENABLED + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue); +#undef cv_hal_cvtBGRtoBGR +#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc + +#endif // OPENCV_RVV_HAL_IMGPROC_HPP diff --git a/hal/riscv-rvv/hal_rvv_1p0/types.hpp b/hal/riscv-rvv/include/types.hpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/types.hpp rename to hal/riscv-rvv/include/types.hpp index 6613a018fc..948bbfbd30 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/types.hpp +++ b/hal/riscv-rvv/include/types.hpp @@ -4,13 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_TYPES_HPP_INCLUDED -#define OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +#ifndef OPENCV_RVV_HAL_TYPES_HPP +#define OPENCV_RVV_HAL_TYPES_HPP #include #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { + +#if CV_HAL_RVV_1P0_ENABLED enum RVV_LMUL { @@ -869,6 +871,8 @@ HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8) #undef HAL_RVV_GROUP -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED +}} // namespace cv::rvv_hal + +#endif //OPENCV_RVV_HAL_TYPES_HPP diff --git a/hal/riscv-rvv/rvv_hal.hpp b/hal/riscv-rvv/rvv_hal.hpp new file mode 100644 index 0000000000..88989aaeb8 --- /dev/null +++ b/hal/riscv-rvv/rvv_hal.hpp @@ -0,0 +1,31 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
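Everything in these headers sits inside `CV_HAL_RVV_1P0_ENABLED` / `CV_HAL_RVV_071_ENABLED` blocks; the macros themselves are derived from the compiler-provided `__riscv_v` value in rvv_hal.hpp in the next hunk (1000000 for the ratified RVV 1.0 profile, 7000 plus the T-Head GCC 10.4 toolchain for the legacy 0.7.1 path). A condensed sketch of that detection, runnable on its own (the real check additionally pins the GCC version, which is omitted here for brevity):

```cpp
#include <cstdio>

#if defined(__riscv_v) && __riscv_v == 1000000
#  define CV_HAL_RVV_1P0_ENABLED 1
#else
#  define CV_HAL_RVV_1P0_ENABLED 0
#endif

#if defined(__riscv_v) && __riscv_v == 7000 && defined(__THEAD_VERSION__)
#  define CV_HAL_RVV_071_ENABLED 1   // rvv_hal.hpp also requires GCC 10.4 exactly
#else
#  define CV_HAL_RVV_071_ENABLED 0
#endif

int main() {
    // On a non-RISC-V host both macros evaluate to 0 and the HAL bodies compile away.
    std::printf("RVV 1.0 HAL enabled: %d, RVV 0.7.1 HAL enabled: %d\n",
                CV_HAL_RVV_1P0_ENABLED, CV_HAL_RVV_071_ENABLED);
    return 0;
}
```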
+ +#ifndef OPENCV_HAL_RVV_HPP_INCLUDED +#define OPENCV_HAL_RVV_HPP_INCLUDED + +#include "opencv2/core/base.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/hal/interface.h" + +#if defined(__riscv_v) && __riscv_v == 1000000 +#define CV_HAL_RVV_1P0_ENABLED 1 +#else +#define CV_HAL_RVV_1P0_ENABLED 0 +#endif + +#if defined(__riscv_v) && __riscv_v == 7000 && defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) +#define CV_HAL_RVV_071_ENABLED 1 +#else +#define CV_HAL_RVV_071_ENABLED 0 +#endif + +#if CV_HAL_RVV_1P0_ENABLED || CV_HAL_RVV_071_ENABLED +#include +#endif +#include "include/types.hpp" +#include "include/core.hpp" +#include "include/imgproc.hpp" + +#endif // OPENCV_HAL_RVV_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/core/atan.cpp b/hal/riscv-rvv/src/core/atan.cpp new file mode 100644 index 0000000000..e2b0d5c314 --- /dev/null +++ b/hal/riscv-rvv/src/core/atan.cpp @@ -0,0 +1,64 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg) +{ + auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e32m4(n); + + auto vy = __riscv_vle32_v_f32m4(y, vl); + auto vx = __riscv_vle32_v_f32m4(x, vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse32(dst, a, vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg) +{ + // this also uses float32 version, ref: mathfuncs_core.simd.hpp + + auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad; + + for (size_t vl = 0; n > 0; n -= vl) + { + vl = __riscv_vsetvl_e64m8(n); + + auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); + auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl); + + auto a = common::rvv_atan(vy, vx, vl, atan_params); + + __riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); + + x += vl; + y += vl; + dst += vl; + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp b/hal/riscv-rvv/src/core/cart_to_polar.cpp similarity index 53% rename from hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp rename to hal/riscv-rvv/src/core/cart_to_polar.cpp index 676133b668..56ee0fcefc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cart_to_polar.hpp +++ b/hal/riscv-rvv/src/core/cart_to_polar.cpp @@ -4,27 +4,20 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
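The renames that follow (cart_to_polar, cholesky, compare, and so on) all apply the same layout convention: the `cv_hal_*` binding and the typed declarations move to include/*.hpp, while the template implementation stays in an anonymous namespace inside src/*.cpp and is exported through thin per-type wrappers. An illustrative-only sketch of that shape, using a made-up `magnitude32f` kernel (the macro name merely follows the `cv_hal_` convention shown above; nothing here claims core exposes that exact hook):

```cpp
#include <cmath>

// include/core.hpp -- declaration plus HAL binding, no implementation details
namespace cv { namespace rvv_hal { namespace core {
int magnitude32f(const float* x, const float* y, float* dst, int len);
}}} // cv::rvv_hal::core
#undef  cv_hal_magnitude32f
#define cv_hal_magnitude32f cv::rvv_hal::core::magnitude32f

// src/core/magnitude.cpp -- template kept internal, exported via a typed wrapper
namespace cv { namespace rvv_hal { namespace core {
namespace { // anonymous: the template never leaks out of the translation unit

template <typename T>
int magnitude_impl(const T* x, const T* y, T* dst, int len) {
    for (int i = 0; i < len; ++i)
        dst[i] = static_cast<T>(std::sqrt(double(x[i]) * x[i] + double(y[i]) * y[i]));
    return 0; // CV_HAL_ERROR_OK
}

} // anonymous

int magnitude32f(const float* x, const float* y, float* dst, int len) {
    return magnitude_impl(x, y, dst, len);
}

}}} // cv::rvv_hal::core
```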
-#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED -#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/atan.hpp" -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cartToPolar32f -#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar -#undef cv_hal_cartToPolar64f -#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar +namespace { template inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees) { using CalType = RVV_SameLen; - auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad; + auto atan_params = angleInDegrees ? common::atan_params_deg : common::atan_params_rad; size_t vl; for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl) { @@ -33,16 +26,25 @@ inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool a auto vx = CalType::cast(RVV_T::vload(x, vl), vl); auto vy = CalType::cast(RVV_T::vload(y, vl), vl); - auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl); - auto vangle = detail::rvv_atan(vy, vx, vl, atan_params); + auto vangle = common::rvv_atan(vy, vx, vl, atan_params); RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED +int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} +int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees) { + return cartToPolar(x, y, mag, angle, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp b/hal/riscv-rvv/src/core/cholesky.cpp similarity index 88% rename from hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp rename to hal/riscv-rvv/src/core/cholesky.cpp index b5d9d3e891..995e7eb5be 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/cholesky.hpp +++ b/hal/riscv-rvv/src/core/cholesky.cpp @@ -4,20 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
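The hunk above turns the header-only cartToPolar template into a compiled translation unit with `cartToPolar32f`/`cartToPolar64f` wrappers. As a rough scalar reference for what the vector kernel computes, here is a hedged sketch; the polynomial coefficients are copied from `atan_params_rad` in the src/core/common.hpp hunk further down, everything else is a simplification rather than OpenCV code (the HAL computes magnitude via `common::sqrt`, i.e. `vfrsqrt7` plus Newton refinement, and the angle via `common::rvv_atan`):

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>

// Octant-folded polynomial approximation of atan2, result in [0, 360) deg or [0, 2*pi) rad.
static float fast_atan2_ref(float y, float x, bool angleInDegrees)
{
    const float P1 = 0.9997878412794807f, P3 = -0.3258083974640975f,
                P5 = 0.1555786518463281f, P7 = -0.04432655554792128f;
    const float PI = 3.14159265358979323846f;

    float ax = std::fabs(x), ay = std::fabs(y);
    float c  = std::min(ax, ay) / (std::max(ax, ay) + FLT_EPSILON);
    float c2 = c * c;
    float a  = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c;   // ~ atan(min/max), first octant

    if (ax < ay) a = PI / 2 - a;   // fold back out of the first octant
    if (x < 0)   a = PI     - a;
    if (y < 0)   a = 2 * PI - a;
    return angleInDegrees ? a * (180.f / PI) : a;
}

static void cart_to_polar_ref(const float* x, const float* y, float* mag, float* angle,
                              int len, bool angleInDegrees)
{
    for (int i = 0; i < len; ++i) {
        mag[i]   = std::sqrt(x[i] * x[i] + y[i] * y[i]);        // vectorized as common::sqrt
        angle[i] = fast_atan2_ref(y[i], x[i], angleInDegrees);  // vectorized as common::rvv_atan
    }
}
```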
-#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED -#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace cholesky { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_Cholesky32f -#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky -#undef cv_hal_Cholesky64f -#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::CholImpl @@ -119,6 +114,15 @@ inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} +int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info) { + return Cholesky(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/common.hpp b/hal/riscv-rvv/src/core/common.hpp new file mode 100644 index 0000000000..8ee43a984d --- /dev/null +++ b/hal/riscv-rvv/src/core/common.hpp @@ -0,0 +1,195 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
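`Cholesky32f`/`Cholesky64f` above wrap a templated solver that, per the retained comment, mirrors `cv::CholImpl` in core/src/matrix_decomp.cpp. For orientation only, here is the textbook scalar core of that computation, a Cholesky–Banachiewicz factorization A = L·Lᵀ of a symmetric positive-definite matrix; it deliberately omits the HAL's in-place layout, the right-hand-side solve, and the `info` reporting:

```cpp
#include <cmath>
#include <vector>

// Row-major m x m matrix; on success the lower triangle holds L, and the
// return value plays the role of the HAL's *info flag (positive definiteness).
static bool cholesky_factor_ref(std::vector<double>& A, int m)
{
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j <= i; ++j) {
            double s = A[i * m + j];
            for (int k = 0; k < j; ++k)
                s -= A[i * m + k] * A[j * m + k];
            if (i == j) {
                if (s <= 0.0)
                    return false;                 // not positive definite
                A[i * m + i] = std::sqrt(s);
            } else {
                A[i * m + j] = s / A[j * m + j];
            }
        }
    }
    return true;
}
```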
+ +#ifndef OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED + +#include +#include +#include + +namespace cv { namespace rvv_hal { namespace core { namespace common { + +#if CV_HAL_RVV_1P0_ENABLED + +#define CV_HAL_RVV_NOOP(a) (a) + +// ############ abs ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \ + inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \ + _Tpvs mask = __riscv_vsra(v, shift, vl); \ + _Tpvs v_xor = __riscv_vxor(v, mask, vl); \ + return __riscv_vreinterpret_##suffix( \ + __riscv_vsub(v_xor, mask, vl) \ + ); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8) + +// ############ absdiff ############ + +#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \ + inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \ + return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \ + } + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu) + +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin) +CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin) + +// ############ reciprocal ############ + +inline vfloat32m4_t __riscv_vfrec(const vfloat32m4_t &x, const int vl) { + auto rec = __riscv_vfrec7(x, vl); + auto cls = __riscv_vfclass(rec, vl); + auto m = __riscv_vmseq(__riscv_vand(cls, 0b10111000, vl), 0, vl); + auto two = __riscv_vfmv_v_f_f32m4(2.f, vl); + rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl); + rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl); + return rec; +} + +// ############ atan ############ + +// ref: mathfuncs_core.simd.hpp +static constexpr float pi = CV_PI; + +struct AtanParams +{ + float p1, p3, p5, p7, angle_90; +}; + +static constexpr AtanParams atan_params_rad { + 0.9997878412794807F, + -0.3258083974640975F, + 0.1555786518463281F, + -0.04432655554792128F, + 90.F * (pi / 180.F)}; +static constexpr AtanParams atan_params_deg { + atan_params_rad.p1 * (180 / pi), + atan_params_rad.p3 * (180 / pi), + atan_params_rad.p5 * (180 / pi), + atan_params_rad.p7 * (180 / pi), + 90.F}; + +template +__attribute__((always_inline)) inline VEC_T + rvv_atan(VEC_T vy, VEC_T vx, size_t vl, 
const AtanParams& params) +{ + const auto ax = __riscv_vfabs(vx, vl); + const auto ay = __riscv_vfabs(vy, vl); + // Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar. + const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl), + __riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl), + vl); + const auto c2 = __riscv_vfmul(c, c, vl); + + // Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional + // M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased + // from 5.952ms to 5.805ms on Muse Pi) + // Additionally, when registers are nearly fully utilized (though not yet exhausted), the + // compiler is likely to fail to optimize and may introduce slower memory access (e.g., in + // cv::rvv_hal::fast_atan_64). + // Saving registers can also make this function more reusable in other contexts. + // Therefore, vfmadd is not used here. + auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl); + a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl); + a = __riscv_vfmul(a, c, vl); + + a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); + a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); + + return a; +} + +// ############ sqrt ############ + +template +struct Sqrt32f +{ + using T = RVV_T; + static constexpr size_t iter_times = 2; +}; + +template +struct Sqrt64f +{ + using T = RVV_T; + static constexpr size_t iter_times = 3; +}; + +// Newton-Raphson method +// Use 4 LMUL registers +template +inline VEC_T sqrt(VEC_T x, size_t vl) +{ + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul(t, y, vl); + } + // just to prevent the compiler from calculating mask before the iteration, which will run out + // of registers and cause memory access. 
+ asm volatile("" ::: "memory"); + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + return __riscv_vfmul_mu(mask, x, x, y, vl); +} + +// Newton-Raphson method +// Use 3 LMUL registers and 1 mask register +template +inline VEC_T invSqrt(VEC_T x, size_t vl) +{ + auto classified = __riscv_vfclass(x, vl); + // block -0, +0, positive subnormal number, +inf + auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl); + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#ifdef __clang__ +#pragma unroll +#endif + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul_mu(mask, y, t, y, vl); + } + return y; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}}} // cv::rvv_hal::core::common + +#endif // OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp b/hal/riscv-rvv/src/core/compare.cpp similarity index 76% rename from hal/riscv-rvv/hal_rvv_1p0/compare.hpp rename to hal/riscv-rvv/src/core/compare.cpp index 6efd92e18a..ccf0151afb 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/compare.hpp +++ b/hal/riscv-rvv/src/core/compare.cpp @@ -5,12 +5,11 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED -#define OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace compare { +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -90,23 +89,6 @@ int compare_impl(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, return CV_HAL_ERROR_OK; } -} // anonymous - -#undef cv_hal_cmp8u -#define cv_hal_cmp8u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp8s -#define cv_hal_cmp8s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16u -#define cv_hal_cmp16u cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp16s -#define cv_hal_cmp16s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32s -#define cv_hal_cmp32s cv::cv_hal_rvv::compare::compare -#undef cv_hal_cmp32f -#define cv_hal_cmp32f cv::cv_hal_rvv::compare::compare -// #undef cv_hal_cmp64f -// #define cv_hal_cmp64f cv::cv_hal_rvv::compare::compare - template inline int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { @@ -121,6 +103,27 @@ int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size } } -}}} // cv::cv_hal_rvv::compare +} // namespace anonymous -#endif // OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED +int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int 
width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} +int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) { + return compare(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp b/hal/riscv-rvv/src/core/convert_scale.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp rename to hal/riscv-rvv/src/core/convert_scale.cpp index 2f28f20bfd..8c5f83a677 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/convert_scale.hpp +++ b/hal/riscv-rvv/src/core/convert_scale.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED -#define OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_convertScale -#define cv_hal_convertScale cv::cv_hal_rvv::convertScale +#if CV_HAL_RVV_1P0_ENABLED inline int convertScale_8U8U(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta) { @@ -89,8 +85,8 @@ inline int convertScale_32F32F(const uchar* src, size_t src_step, uchar* dst, si return CV_HAL_ERROR_OK; } -inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, - int sdepth, int ddepth, double alpha, double beta) +int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, + int width, int height, int sdepth, int ddepth, double alpha, double beta) { if (!dst) return CV_HAL_ERROR_OK; @@ -118,6 +114,6 @@ inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t ds return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp b/hal/riscv-rvv/src/core/copy_mask.cpp similarity index 57% rename from hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp rename to hal/riscv-rvv/src/core/copy_mask.cpp index f13b8bc22e..77dc648779 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/copy_mask.hpp +++ b/hal/riscv-rvv/src/core/copy_mask.cpp @@ -5,21 +5,17 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
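The convert_scale.cpp hunk above exposes `convertScale`, which dispatches on (sdepth, ddepth) pairs such as 8U→8U, 8U→32F and 32F→32F and returns `CV_HAL_ERROR_NOT_IMPLEMENTED` for anything else. A hedged scalar reference for the 8U→8U path; the rounding and clamping below stand in for `cv::saturate_cast<uchar>` and are not the RVV kernel itself:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// dst(y, x) = saturate(src(y, x) * alpha + beta), rows addressed via byte steps.
static void convert_scale_8u8u_ref(const uint8_t* src, std::size_t src_step,
                                   uint8_t* dst, std::size_t dst_step,
                                   int width, int height, double alpha, double beta)
{
    for (int y = 0; y < height; ++y) {
        const uint8_t* s = src + y * src_step;
        uint8_t* d = dst + y * dst_step;
        for (int x = 0; x < width; ++x) {
            double v = s[x] * alpha + beta;
            v = std::round(v);                                    // approximates saturate_cast
            d[x] = (uint8_t)std::min(255.0, std::max(0.0, v));
        }
    }
}
```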
-#ifndef OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED -#define OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_copyToMasked -#define cv_hal_copyToMasked cv::cv_hal_rvv::copyToMasked +#if CV_HAL_RVV_1P0_ENABLED namespace { #define CV_HAL_RVV_COPY_MASK_eXc1(X, mask_lmul) \ static int copyToMasked_e##X##c1(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -41,7 +37,7 @@ CV_HAL_RVV_COPY_MASK_eXc1(64, 1) #define CV_HAL_RVV_COPY_MASK_eXc3(X, mask_lmul) \ static int copyToMasked_e##X##c3(const uchar *src_data, size_t src_step, const uchar *mask_data, size_t mask_step, \ - uchar *dst_data, size_t dst_step, int width, int height) { \ + uchar *dst_data, size_t dst_step, int width, int height) { \ for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { \ const uint##X##_t *src = (const uint##X##_t*)src_data; \ uint##X##_t *dst = (uint##X##_t*)dst_data; \ @@ -62,9 +58,9 @@ CV_HAL_RVV_COPY_MASK_eXc3(32, f2) CV_HAL_RVV_COPY_MASK_eXc3(64, f4) static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -80,9 +76,9 @@ static int copyToMasked_e64c2(const uchar *src_data, size_t src_step, } static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, - const uchar *mask_data, size_t mask_step, - uchar *dst_data, size_t dst_step, int width, - int height) { + const uchar *mask_data, size_t mask_step, + uchar *dst_data, size_t dst_step, int width, + int height) { for (; height--; mask_data += mask_step, src_data += src_step, dst_data += dst_step) { const uint64_t *src = (const uint64_t *)src_data; uint64_t *dst = (uint64_t *)dst_data; @@ -100,71 +96,35 @@ static int copyToMasked_e64c4(const uchar *src_data, size_t src_step, } // anonymous using CopyToMaskedFunc = int (*)(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int); -inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, - int type, const uchar *mask_data, size_t mask_step, int mask_type) { - int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); +int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, + int type, const uchar *mask_data, size_t mask_step, int mask_type) { + int cn = CV_MAT_CN(type); int mdepth = CV_MAT_DEPTH(mask_type), mcn = CV_MAT_CN(mask_type); if (mcn > 1 || mdepth != CV_8U) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - CopyToMaskedFunc func = nullptr; - switch (depth) { - case CV_8U: {} - case CV_8S: switch (cn) { - case 1: func = copyToMasked_e8c1; break; - case 2: func = copyToMasked_e16c1; break; - case 3: func = copyToMasked_e8c3; break; - case 4: func = copyToMasked_e32c1; break; - case 
6: func = copyToMasked_e16c3; break; - case 8: func = copyToMasked_e64c1; break; - default: func = nullptr; - }; break; - case CV_16U: {} - case CV_16S: switch (cn) { - case 1: func = copyToMasked_e16c1; break; - case 2: func = copyToMasked_e32c1; break; - case 3: func = copyToMasked_e16c3; break; - case 4: func = copyToMasked_e64c1; break; - case 6: func = copyToMasked_e32c3; break; - case 8: func = copyToMasked_e64c2; break; - default: func = nullptr; break; - }; break; - case CV_32S: {} - case CV_32F: switch (cn) { - case 1: func = copyToMasked_e32c1; break; - case 2: func = copyToMasked_e64c1; break; - case 3: func = copyToMasked_e32c3; break; - case 4: func = copyToMasked_e64c2; break; - case 6: func = copyToMasked_e64c3; break; - case 8: func = copyToMasked_e64c4; break; - default: func = nullptr; break; - }; break; - case CV_64F: switch (cn) { - case 1: func = copyToMasked_e64c1; break; - case 2: func = copyToMasked_e64c2; break; - case 3: func = copyToMasked_e64c3; break; - case 4: func = copyToMasked_e64c4; break; - default: func = nullptr; break; - }; break; - default: func = nullptr; - } - + static CopyToMaskedFunc tab[] = { + 0, copyToMasked_e8c1, copyToMasked_e16c1, copyToMasked_e8c3, + copyToMasked_e32c1, 0, copyToMasked_e16c3, 0, + copyToMasked_e64c1, 0, 0, 0, + copyToMasked_e32c3, 0, 0, 0, + copyToMasked_e64c2, 0, 0, 0, + 0, 0, 0, 0, + copyToMasked_e64c3, 0, 0, 0, + 0, 0, 0, 0, + copyToMasked_e64c4 + }; + size_t elem_size = CV_ELEM_SIZE(type); + CopyToMaskedFunc func = elem_size <= 32 ? tab[elem_size] : nullptr; if (func == nullptr) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } - static const size_t elem_size_tab[CV_DEPTH_MAX] = { - sizeof(uchar), sizeof(schar), - sizeof(ushort), sizeof(short), - sizeof(int), sizeof(float), - sizeof(int64_t), 0, - }; - CV_Assert(elem_size_tab[depth]); - - bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1)); - bool dst_continuous = (dst_step == width * elem_size_tab[depth] * cn || (dst_step != width * elem_size_tab[depth] * cn && height == 1)); + size_t elem_size1 = static_cast(CV_ELEM_SIZE1(type)); + bool src_continuous = (src_step == width * elem_size1 * cn || (src_step != width * elem_size1 * cn && height == 1)); + bool dst_continuous = (dst_step == width * elem_size1 * cn || (dst_step != width * elem_size1 * cn && height == 1)); bool mask_continuous = (mask_step == static_cast(width)); size_t nplanes = 1; int _width = width, _height = height; @@ -189,6 +149,6 @@ inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, return CV_HAL_ERROR_OK; } -}} // cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/div.cpp b/hal/riscv-rvv/src/core/div.cpp new file mode 100644 index 0000000000..4af9ef2588 --- /dev/null +++ b/hal/riscv-rvv/src/core/div.cpp @@ -0,0 +1,299 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
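The rewrite above replaces the nested depth/channel switch with a flat table indexed by `CV_ELEM_SIZE(type)`, i.e. bytes per pixel, so any types that share a pixel size share a kernel (for example CV_16UC2 and CV_32SC1 both copy as single 32-bit lanes under the mask). A small standalone illustration of that mapping; the table merely restates `tab[]` from the hunk and is not OpenCV code:

```cpp
#include <cstdio>

int main()
{
    struct { const char* name; int bytes_per_channel; int channels; } types[] = {
        {"CV_8UC1", 1, 1}, {"CV_8UC3", 1, 3}, {"CV_16UC2", 2, 2},
        {"CV_32SC1", 4, 1}, {"CV_32FC3", 4, 3}, {"CV_64FC4", 8, 4},
    };

    // Index = CV_ELEM_SIZE(type) in bytes, mirroring tab[] in copy_mask.cpp above.
    const char* kernels[33] = {};
    kernels[1]  = "e8c1";  kernels[2]  = "e16c1"; kernels[3]  = "e8c3";
    kernels[4]  = "e32c1"; kernels[6]  = "e16c3"; kernels[8]  = "e64c1";
    kernels[12] = "e32c3"; kernels[16] = "e64c2"; kernels[24] = "e64c3";
    kernels[32] = "e64c4";

    for (const auto& t : types) {
        int elem_size = t.bytes_per_channel * t.channels;   // what CV_ELEM_SIZE computes
        std::printf("%-9s -> %2d bytes/pixel -> copyToMasked_%s\n",
                    t.name, elem_size, kernels[elem_size] ? kernels[elem_size] : "(none)");
    }
    return 0;
}
```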
+ +#include "rvv_hal.hpp" +#include "common.hpp" +#include + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +inline size_t setvl(int l) { return __riscv_vsetvl_e8m1(l); } + +inline vuint8m1_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m1(p, vl); } +inline vint8m1_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m1(p, vl); } +inline vuint16m2_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m2(p, vl); } +inline vint16m2_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m2(p, vl); } +inline vint32m4_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m4(p, vl); } +inline vfloat32m4_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m4(p, vl); } + +inline void vse(uint8_t *p, const vuint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(int8_t *p, const vint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); } +inline void vse(uint16_t *p, const vuint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int16_t *p, const vint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); } +inline void vse(int *p, const vint32m4_t &v, int vl) { __riscv_vse32(p, v, vl); } +inline void vse(float *p, const vfloat32m4_t &v, int vl) { __riscv_vse32(p, v, vl); } + +inline vuint16m2_t ext(const vuint8m1_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); } +inline vint16m2_t ext(const vint8m1_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); } + +inline vuint8m1_t nclip(const vuint16m2_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); } +inline vint8m1_t nclip(const vint16m2_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); } + +template inline +VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) { + return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl); +} +template <> inline +vint16m2_t div_sat(const vint16m2_t &v1, const vint16m2_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfwcvt_f(v1, vl); + auto f2 = __riscv_vfwcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl); + return __riscv_vfncvt_x(res, vl); +} +template <> inline +vuint16m2_t div_sat(const vuint16m2_t &v1, const vuint16m2_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfwcvt_f(v1, vl); + auto f2 = __riscv_vfwcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl); + return __riscv_vfncvt_xu(res, vl); +} +template <> inline +vint32m4_t div_sat(const vint32m4_t &v1, const vint32m4_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m4_t div_sat(const vuint32m4_t &v1, const vuint32m4_t &v2, const float scale, const int vl) { + auto f1 = __riscv_vfcvt_f(v1, vl); + auto f2 = __riscv_vfcvt_f(v2, vl); + auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl); + return __riscv_vfcvt_xu(res, vl); +} + +template inline +VT recip_sat(const VT &v, const float scale, const int vl) { + return nclip(recip_sat(ext(v, vl), scale, vl), vl); +} +template <> inline +vint16m2_t recip_sat(const vint16m2_t &v, const float scale, const int vl) { + auto f = __riscv_vfwcvt_f(v, vl); + auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl); + return __riscv_vfncvt_x(res, vl); +} 
+template <> inline +vuint16m2_t recip_sat(const vuint16m2_t &v, const float scale, const int vl) { + auto f = __riscv_vfwcvt_f(v, vl); + auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl); + return __riscv_vfncvt_xu(res, vl); +} +template <> inline +vint32m4_t recip_sat(const vint32m4_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl); + return __riscv_vfcvt_x(res, vl); +} +template <> inline +vuint32m4_t recip_sat(const vuint32m4_t &v, const float scale, const int vl) { + auto f = __riscv_vfcvt_f(v, vl); + auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl); + return __riscv_vfcvt_xu(res, vl); +} + +// Implementation + +template inline +int div(const ST *src1, size_t step1, const ST *src2, size_t step2, + ST *dst, size_t step, int width, int height, float scale) { + float max_fval = static_cast(std::numeric_limits::max()); + if (scale == 0.f || ((scale * max_fval) < 1.f && (scale * max_fval) > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + + for (int h = 0; h < height; h++) { + const ST *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const ST *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + ST *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + auto mask = __riscv_vmseq(v2, 0, vl); + vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int div(const float *src1, size_t step1, const float *src2, size_t step2, + float *dst, size_t step, int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, common::__riscv_vfrec(v2, vl), vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src1_h = reinterpret_cast((const uchar*)src1 + h * step1); + const float *src2_h = reinterpret_cast((const uchar*)src2 + h * step2); + float *dst_h = reinterpret_cast((uchar*)dst + h * step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v1 = vle(src1_h + w, vl); + auto v2 = vle(src2_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfmul(common::__riscv_vfrec(v2, vl), scale, vl), vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template inline +int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step, + int width, int height, float scale) { + if (scale == 0.f || (scale < 1.f && scale > -1.f)) { + for (int h = 0; h < height; h++) { + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(ST) * width); + } + return CV_HAL_ERROR_OK; + } + 
+ for (int h = 0; h < height; h++) { + const ST *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + ST *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + auto mask = __riscv_vmseq(v, 0, vl); + vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template <> +int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, + int width, int height, float scale) { + if (scale == 0.f) { + for (int h = 0; h < height; h++) { + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + std::memset(dst_h, 0, sizeof(float) * width); + } + return CV_HAL_ERROR_OK; + } + + if (std::fabs(scale - 1.f) < FLT_EPSILON) { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, common::__riscv_vfrec(v, vl), vl); + } + } + } else { + for (int h = 0; h < height; h++) { + const float *src_h = reinterpret_cast((const uchar*)src_data + h * src_step); + float *dst_h = reinterpret_cast((uchar*)dst_data + h * dst_step); + + int vl; + for (int w = 0; w < width; w += vl) { + vl = setvl(width - w); + + auto v = vle(src_h + w, vl); + + vse(dst_h + w, __riscv_vfmul(common::__riscv_vfrec(v, vl), scale, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} +int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return div(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, scale); +} + +int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip8s(const schar *src_data, size_t src_step, schar *dst_data, 
size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} +int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale) { + return recip(src_data, src_step, dst_data, dst_step, width, height, scale); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp b/hal/riscv-rvv/src/core/dotprod.cpp similarity index 81% rename from hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp rename to hal/riscv-rvv/src/core/dotprod.cpp index e16a97cf6a..11a44697de 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dotprod.hpp +++ b/hal/riscv-rvv/src/core/dotprod.cpp @@ -5,21 +5,16 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. - -#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED -#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" #include -namespace cv { namespace cv_hal_rvv { namespace dotprod { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_dotProduct -#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod +#if CV_HAL_RVV_1P0_ENABLED namespace { -double dotProd_8u(const uchar *a, const uchar *b, int len) { +static inline double dotProd_8u(const uchar *a, const uchar *b, int len) { constexpr int block_size0 = (1 << 15); double r = 0; @@ -47,7 +42,7 @@ double dotProd_8u(const uchar *a, const uchar *b, int len) { return r; } -double dotProd_8s(const schar *a, const schar *b, int len) { +static inline double dotProd_8s(const schar *a, const schar *b, int len) { constexpr int block_size0 = (1 << 14); double r = 0; @@ -75,7 +70,7 @@ double dotProd_8s(const schar *a, const schar *b, int len) { return r; } -double dotProd_16u(const ushort *a, const ushort *b, int len) { +static inline double dotProd_16u(const ushort *a, const ushort *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -103,7 +98,7 @@ double dotProd_16u(const ushort *a, const ushort *b, int len) { return r; } -double dotProd_16s(const short *a, const short *b, int len) { +static inline double dotProd_16s(const short *a, const short *b, int len) { constexpr int block_size0 = (1 << 24); double r = 0; @@ -131,7 +126,7 @@ double dotProd_16s(const short *a, const short *b, int len) { return r; } -double dotProd_32s(const int *a, const int *b, int len) { +static inline double dotProd_32s(const int *a, const int *b, int len) { double r = 0; vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8()); @@ -149,7 +144,7 @@ double dotProd_32s(const int *a, const int *b, int len) { return r; } -double dotProd_32f(const float *a, const float *b, int len) { +static inline double dotProd_32f(const float *a, const float *b, int len) { constexpr int block_size0 = (1 << 11); double r = 0.f; @@ -180,8 +175,8 @@ double 
dotProd_32f(const float *a, const float *b, int len) { } // anonymous using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len); -inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, - int width, int height, int type, double *dot_val) { +int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step, + int width, int height, int type, double *dot_val) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); static DotProdFunc dotprod_tab[CV_DEPTH_MAX] = { @@ -195,16 +190,9 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size return CV_HAL_ERROR_NOT_IMPLEMENTED; } - static const size_t elem_size_tab[CV_DEPTH_MAX] = { - sizeof(uchar), sizeof(schar), - sizeof(ushort), sizeof(short), - sizeof(int), sizeof(float), - sizeof(int64_t), 0, - }; - CV_Assert(elem_size_tab[depth]); - - bool a_continuous = (a_step == width * elem_size_tab[depth] * cn); - bool b_continuous = (b_step == width * elem_size_tab[depth] * cn); + size_t elem_size1 = static_cast<size_t>(CV_ELEM_SIZE1(type)); + bool a_continuous = (a_step == width * elem_size1 * cn); + bool b_continuous = (b_step == width * elem_size1 * cn); size_t nplanes = 1; size_t len = width * height; if (!a_continuous || !b_continuous) { @@ -228,6 +216,6 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::dotprod +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp b/hal/riscv-rvv/src/core/dxt.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/dxt.hpp rename to hal/riscv-rvv/src/core/dxt.cpp index 25f4879532..fa0c464e88 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/dxt.hpp +++ b/hal/riscv-rvv/src/core/dxt.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_DXT_HPP_INCLUDED -#define OPENCV_HAL_RVV_DXT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" -#include "opencv2/core/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace dxt { - -#undef cv_hal_dft -#define cv_hal_dft cv::cv_hal_rvv::dxt::dft +#if CV_HAL_RVV_1P0_ENABLED template <typename T> struct rvv; @@ -42,7 +36,7 @@ template<> struct rvv<double> : RVV_F64M1 // in the function template static void cv::DFT and cv::DFT_R2, cv::DFT_R3, cv::DFT_R5 template <typename T> inline int dft(const Complex<T>* src, Complex<T>* dst, int nf, int *factors, T scale, int* itab, - const Complex<T>* wave, int tab_size, int len, bool isInverse, bool noPermute) + const Complex<T>* wave, int tab_size, int len, bool isInverse, bool noPermute) { int n = len; int f_idx, nx; @@ -545,8 +539,8 @@ inline int dft(const Complex<T>* src, Complex<T>* dst, int nf, int *factors, T s return CV_HAL_ERROR_OK; } -inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, int* itab, void* wave, - int tab_size, int n, bool isInverse, bool noPermute) +int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, + int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute) { if( n == 0 ) return CV_HAL_ERROR_OK; @@ -563,6 +557,6 @@ inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, do return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp b/hal/riscv-rvv/src/core/exp.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/exp.hpp rename to hal/riscv-rvv/src/core/exp.cpp index 82690fb321..552fdc0e3f 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/exp.hpp +++ b/hal/riscv-rvv/src/core/exp.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_EXP_HPP_INCLUDED -#define OPENCV_HAL_RVV_EXP_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_exp32f -#define cv_hal_exp32f cv::cv_hal_rvv::exp32f -#undef cv_hal_exp64f -#define cv_hal_exp64f cv::cv_hal_rvv::exp64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -116,7 +110,7 @@ static constexpr double exp_tab_64f[exp_tab_size] = EXP_TAB_VALUE; } // namespace detail -inline int exp32f(const float* src, float* dst, int _len) +int exp32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto exp_a2 = __riscv_vfmv_v_f_f32m4(detail::exp32f_a2, vl); @@ -158,7 +152,7 @@ inline int exp32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int exp64f(const double* src, double* dst, int _len) +int exp64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -203,6 +197,6 @@ inline int exp64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_EXP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp b/hal/riscv-rvv/src/core/flip.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/flip.hpp rename to hal/riscv-rvv/src/core/flip.cpp index 02abeb6e93..42f7c8b16d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/flip.hpp +++ b/hal/riscv-rvv/src/core/flip.cpp @@ -5,13 +5,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. -#ifndef OPENCV_HAL_RVV_FLIP_HPP_INCLUDED -#define OPENCV_HAL_RVV_FLIP_HPP_INCLUDED - - -#include -#include -#include "hal_rvv_1p0/types.hpp" +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \ @@ -24,10 +18,9 @@ #define __riscv_vcreate_v_u64m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u64, 2, v0, v1, v2) #endif -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_flip -#define cv_hal_flip cv::cv_hal_rvv::flip +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -73,6 +66,13 @@ CV_HAL_RVV_FLIP_INPLACE_C1(16UC1, ushort, RVV_U16M8) CV_HAL_RVV_FLIP_INPLACE_C1(32UC1, unsigned, RVV_U32M8) CV_HAL_RVV_FLIP_INPLACE_C1(64UC1, uint64_t, RVV_U64M8) +// Suppress warnings of "ignoring attributes applied to VecType after definition", +// VecType is vuint8m2x3_t, vuint16m2x3_t, vuint32m2x3_t or vuint64m2x3_t +#if defined (__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif + #define CV_HAL_RVV_FLIP_C3_TYPES(width) \ struct RVV_C3_U##width##M2 : RVV_U##width##M2 { \ static inline vuint##width##m2x3_t vload3(const uint##width##_t *base, size_t vl) { return __riscv_vlseg3e##width##_v_u##width##m2x3(base, vl); } \ @@ -90,6 +90,10 @@ CV_HAL_RVV_FLIP_C3_TYPES(16) CV_HAL_RVV_FLIP_C3_TYPES(32) CV_HAL_RVV_FLIP_C3_TYPES(64) +#if defined (__GNUC__) +#pragma GCC diagnostic pop +#endif + #define CV_HAL_RVV_FLIP_C3(name, _Tps, RVV) \ inline void flip_##name(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width, int src_height, int flip_mode) { \ for (int h = 0; h < src_height; h++) { \ @@ -302,7 +306,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height 0, 0, 0, 0, 0 }; - 
FlipInplaceFunc func = flip_inplace_func_tab[esz]; + FlipInplaceFunc func = esz <= 32 ? flip_inplace_func_tab[esz] : nullptr; if (!func) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -311,7 +315,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height return CV_HAL_ERROR_OK; } -inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, +int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int flip_mode) { int esz = CV_ELEM_SIZE(src_type); @@ -344,7 +348,7 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi 0, 0, 0, 0, 0 }; - FlipFunc func = flip_func_tab[esz]; + FlipFunc func = esz <= 32 ? flip_func_tab[esz] : nullptr; if (func) { func(src_data, src_step, dst_data, dst_step, src_width, src_height, flip_mode); return CV_HAL_ERROR_OK; @@ -368,6 +372,6 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_FLIP_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/log.hpp b/hal/riscv-rvv/src/core/log.cpp similarity index 98% rename from hal/riscv-rvv/hal_rvv_1p0/log.hpp rename to hal/riscv-rvv/src/core/log.cpp index 8df0761861..0783e3be54 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/log.hpp +++ b/hal/riscv-rvv/src/core/log.cpp @@ -4,17 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LOG_HPP_INCLUDED -#define OPENCV_HAL_RVV_LOG_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_log32f -#define cv_hal_log32f cv::cv_hal_rvv::log32f -#undef cv_hal_log64f -#define cv_hal_log64f cv::cv_hal_rvv::log64f +#if CV_HAL_RVV_1P0_ENABLED namespace detail { @@ -306,7 +300,7 @@ static constexpr double log_tab_64f[log_tab_size] = LOG_TAB_VALUE; } // namespace detail -inline int log32f(const float* src, float* dst, int _len) +int log32f(const float* src, float* dst, int _len) { size_t vl = __riscv_vsetvlmax_e32m4(); auto log_a2 = __riscv_vfmv_v_f_f32m4(detail::log32f_a2, vl); @@ -340,7 +334,7 @@ inline int log32f(const float* src, float* dst, int _len) return CV_HAL_ERROR_OK; } -inline int log64f(const double* src, double* dst, int _len) +int log64f(const double* src, double* dst, int _len) { size_t vl = __riscv_vsetvlmax_e64m4(); // all vector registers are used up, so not load more constants @@ -382,6 +376,6 @@ inline int log64f(const double* src, double* dst, int _len) return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_LOG_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp b/hal/riscv-rvv/src/core/lu.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/lu.hpp rename to hal/riscv-rvv/src/core/lu.cpp index 6de137fe82..d4579caa47 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lu.hpp +++ b/hal/riscv-rvv/src/core/lu.cpp @@ -4,21 +4,16 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED -#define OPENCV_HAL_RVV_LU_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace lu { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_LU32f -#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU -#undef cv_hal_LU64f -#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::LUImpl @@ -167,6 +162,15 @@ inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} +int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) { + return LU(src1, src1_step, m, src2, src2_step, n, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp b/hal/riscv-rvv/src/core/lut.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/lut.hpp rename to hal/riscv-rvv/src/core/lut.cpp index c13a5b2f0a..a90afd2604 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/lut.hpp +++ b/hal/riscv-rvv/src/core/lut.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_LUT_HPP_INCLUDED -#define OPENCV_HAL_RVV_LUT_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/types.hpp" - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_lut -#define cv_hal_lut cv::cv_hal_rvv::lut +#if CV_HAL_RVV_1P0_ENABLED // need vlen >= 256 struct LUTCacheU8 : RVV_U8M8 @@ -135,7 +127,7 @@ private: LUTParallelBody& operator=(const LUTParallelBody&); }; -inline int lut(const uchar* src_data, +int lut(const uchar* src_data, size_t src_step, size_t src_type, const uchar* lut_data, @@ -191,6 +183,6 @@ inline int lut(const uchar* src_data, return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_LUT_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp b/hal/riscv-rvv/src/core/magnitude.cpp similarity index 54% rename from hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp rename to hal/riscv-rvv/src/core/magnitude.cpp index eb814c1b77..8630b717da 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/magnitude.hpp +++ b/hal/riscv-rvv/src/core/magnitude.cpp @@ -4,20 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -#include "hal_rvv_1p0/sqrt.hpp" -#include "hal_rvv_1p0/types.hpp" +#if CV_HAL_RVV_1P0_ENABLED -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_magnitude32f -#define cv_hal_magnitude32f cv::cv_hal_rvv::magnitude> -#undef cv_hal_magnitude64f -#define cv_hal_magnitude64f cv::cv_hal_rvv::magnitude> +namespace { template inline int magnitude(const T* x, const T* y, T* dst, int len) @@ -30,13 +24,22 @@ inline int magnitude(const T* x, const T* y, T* dst, int len) auto vx = SQRT_T::T::vload(x, vl); auto vy = SQRT_T::T::vload(y, vl); - auto vmag = detail::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); + auto vmag = common::sqrt(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl); SQRT_T::T::vstore(dst, vmag, vl); } return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +} // anonymous -#endif // OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED +int magnitude32f(const float *x, const float *y, float *dst, int len) { + return magnitude>(x, y, dst, len); +} +int magnitude64f(const double *x, const double *y, double *dst, int len) { + return magnitude>(x, y, dst, len); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp b/hal/riscv-rvv/src/core/mean.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/mean.hpp rename to hal/riscv-rvv/src/core/mean.cpp index e8156371b3..2fc2f98f65 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/mean.hpp +++ b/hal/riscv-rvv/src/core/mean.cpp @@ -4,15 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED -#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_meanStdDev -#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev +#if CV_HAL_RVV_1P0_ENABLED inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); @@ -21,8 +17,8 @@ inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, in inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step); -inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, - int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { +int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type, + double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) { switch (src_type) { case CV_8UC1: @@ -226,6 +222,6 @@ inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, i return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp b/hal/riscv-rvv/src/core/merge.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/merge.hpp rename to hal/riscv-rvv/src/core/merge.cpp index b1da204b39..9dcc6b67e2 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/merge.hpp +++ b/hal/riscv-rvv/src/core/merge.cpp @@ -4,21 +4,7 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED -#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_merge8u -#define cv_hal_merge8u cv::cv_hal_rvv::merge8u -#undef cv_hal_merge16u -#define cv_hal_merge16u cv::cv_hal_rvv::merge16u -#undef cv_hal_merge32s -#define cv_hal_merge32s cv::cv_hal_rvv::merge32s -#undef cv_hal_merge64s -#define cv_hal_merge64s cv::cv_hal_rvv::merge64s +#include "rvv_hal.hpp" #if defined __clang__ && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \ @@ -44,7 +30,11 @@ namespace cv { namespace cv_hal_rvv { #define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3) #endif // clang < 18 -inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +int merge8u(const uchar** src, uchar* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -129,7 +119,7 @@ inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { +int merge16u(const ushort** src, ushort* dst, int len, int cn ) { int vl = 0; if (cn == 1) { @@ -217,7 +207,7 @@ inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge32s(const int** src, int* dst, int len, int cn ) { +int merge32s(const int** src, int* dst, int len, int cn ) { int k = cn % 4 ? cn % 4 : 4; int i, j; if( k == 1 ) @@ -287,7 +277,7 @@ inline int merge32s(const int** src, int* dst, int len, int cn ) { #if defined __GNUC__ && !defined(__clang__) __attribute__((optimize("no-tree-vectorize"))) #endif -inline int merge64s(const int64** src, int64* dst, int len, int cn ) { +int merge64s(const int64** src, int64* dst, int len, int cn ) { int k = cn % 4 ? cn % 4 : 4; int i, j; if( k == 1 ) @@ -354,6 +344,6 @@ inline int merge64s(const int64** src, int64* dst, int len, int cn ) { return CV_HAL_ERROR_OK; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp b/hal/riscv-rvv/src/core/minmax.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/minmax.hpp rename to hal/riscv-rvv/src/core/minmax.cpp index ad551a430e..695cfffd52 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/minmax.hpp +++ b/hal/riscv-rvv/src/core/minmax.cpp @@ -4,19 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED -#define OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace minmax { - -#undef cv_hal_minMaxIdx -#define cv_hal_minMaxIdx cv::cv_hal_rvv::minmax::minMaxIdx -#undef cv_hal_minMaxIdxMaskStep -#define cv_hal_minMaxIdxMaskStep cv::cv_hal_rvv::minmax::minMaxIdx +#if CV_HAL_RVV_1P0_ENABLED template inline int minMaxIdxReadTwice(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal, @@ -269,8 +261,8 @@ inline int minMaxIdxReadOnce(const uchar* src_data, size_t src_step, int width, return CV_HAL_ERROR_OK; } -inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal, - int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0) +int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, + double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step) { if (!mask_step) mask_step = src_step; @@ -296,6 +288,6 @@ inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int heig return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp b/hal/riscv-rvv/src/core/norm.cpp similarity index 95% rename from hal/riscv-rvv/hal_rvv_1p0/norm.hpp rename to hal/riscv-rvv/src/core/norm.cpp index 00062a6fab..5f77b9d0d9 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm.hpp +++ b/hal/riscv-rvv/src/core/norm.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
-#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_norm -#define cv_hal_norm cv::cv_hal_rvv::norm::norm +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -76,7 +73,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); auto v = __riscv_vle8_v_i8m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); } @@ -106,7 +103,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); auto v = __riscv_vle16_v_i16m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); } @@ -121,7 +118,7 @@ struct NormInf_RVV { for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m8(n - i); auto v = __riscv_vle32_v_i32m8(src + i, vl); - s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); } @@ -180,7 +177,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e8m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -208,7 +205,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e16m8(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -223,7 +220,7 @@ struct NormL1_RVV { int vl; for (int i = 0; i < n; i += vl) { vl = __riscv_vsetvl_e32m4(n - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); + auto v = common::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -544,7 +541,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -560,7 +557,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m8(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = 
__riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -657,7 +654,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -672,7 +669,7 @@ struct MaskedNormL1_RVV { int vl; for (int i = 0; i < len; i += vl) { vl = __riscv_vsetvl_e8m4(len - i); - auto v = custom_intrin::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); + auto v = common::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -714,7 +711,7 @@ struct MaskedNormInf_RVV { auto v = __riscv_vlse32_v_i32m8(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl); + s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl); } } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax)); @@ -733,7 +730,7 @@ struct MaskedNormL1_RVV { auto v = __riscv_vlse32_v_i32m4(src + cn * i + cn_index, sizeof(int) * cn, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); - s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, custom_intrin::__riscv_vabs(v, vl), vl), vl); + s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, common::__riscv_vabs(v, vl), vl), vl); } } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -972,8 +969,8 @@ CV_HAL_RVV_DEF_NORM_ALL(64f, double, double, double, double) } using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int); -inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width, - int height, int type, int norm_type, double* result) { +int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); if (result == nullptr || depth == CV_16F || norm_type > NORM_L2SQR) { @@ -1004,18 +1001,8 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas }, }; - static const size_t elem_size_tab[CV_DEPTH_MAX] = { - sizeof(uchar), sizeof(schar), - sizeof(ushort), sizeof(short), - sizeof(int), sizeof(float), - sizeof(double), sizeof(cv::hfloat), - sizeof(cv::bfloat), sizeof(bool), - sizeof(uint64_t), sizeof(int64_t), - sizeof(unsigned), 0, - }; - CV_Assert(elem_size_tab[depth]); - - bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1)); + size_t elem_size1 = static_cast(CV_ELEM_SIZE1(type)); + bool src_continuous = (src_step == width * elem_size1 * cn || (src_step != width * elem_size1 * cn && height == 1)); bool mask_continuous = (mask_step == static_cast(width)); size_t nplanes = 1; size_t size = width * height; @@ -1038,7 +1025,7 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas res.d = 0; if 
((norm_type == NORM_L1 && depth <= CV_16S) || ((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) { - const size_t esz = elem_size_tab[depth] * cn; + const size_t esz = elem_size1 * cn; const int total = (int)size; const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn; const int blockSize = std::min(total, intSumBlockSize); @@ -1095,6 +1082,6 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::norm +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp b/hal/riscv-rvv/src/core/norm_diff.cpp similarity index 92% rename from hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp rename to hal/riscv-rvv/src/core/norm_diff.cpp index c116298072..c2d388e15d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_diff.hpp +++ b/hal/riscv-rvv/src/core/norm_diff.cpp @@ -6,15 +6,12 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED - +#include "rvv_hal.hpp" #include "common.hpp" -namespace cv { namespace cv_hal_rvv { namespace norm_diff { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_normDiff -#define cv_hal_normDiff cv::cv_hal_rvv::norm_diff::normDiff +#if CV_HAL_RVV_1P0_ENABLED namespace { @@ -64,7 +61,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -81,7 +78,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax)); @@ -98,7 +95,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -115,7 +112,7 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax)); @@ -132,7 +129,8 @@ struct NormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(n - i); auto v1 = __riscv_vle32_v_i32m8(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + // auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vmaxu_tu(s, s, v, vl); } return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, 
__riscv_vsetvlmax_e32m1()), vlmax)); @@ -183,7 +181,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -200,7 +198,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(n - i); auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); } return __riscv_vmv_x(s); @@ -216,7 +214,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -232,7 +230,7 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e16m8(n - i); auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(v, s, vl); } return __riscv_vmv_x(s); @@ -249,7 +247,8 @@ struct NormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + // auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl); } return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax)); @@ -299,7 +298,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -315,7 +314,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(n - i); auto v1 = __riscv_vle8_v_i8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_i8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl); } return __riscv_vmv_x(s); @@ -332,7 +331,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_u16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_u16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -350,7 +349,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(n - i); auto v1 = __riscv_vle16_v_i16m2(src1 + i, vl); auto v2 = __riscv_vle16_v_i16m2(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl); } @@ -368,7 +367,7 @@ struct NormDiffL2_RVV { vl = __riscv_vsetvl_e32m4(n - i); auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl); auto v2 = 
__riscv_vle32_v_i32m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto v_mul = __riscv_vwmulu(v, v, vl); s = __riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl); } @@ -471,7 +470,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -482,7 +481,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4); @@ -494,7 +493,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -516,7 +515,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -537,7 +536,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -558,7 +557,7 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e16m8(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, s, s, v, vl); @@ -579,7 +578,8 @@ struct MaskedNormDiffInf_RVV { vl = __riscv_vsetvl_e32m8(len - i); auto v1 = __riscv_vlse32_v_i32m8(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m8(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + // auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vmaxu_tumu(b, 
s, s, v, vl); @@ -655,7 +655,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -666,7 +666,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m2(len - i); auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m2(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1()); @@ -678,7 +678,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -700,7 +700,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m8(len - i); auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m8(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1()); @@ -720,7 +720,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -740,7 +740,7 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu_tum(b, s, v, s, vl); @@ -761,7 +761,8 @@ struct MaskedNormDiffL1_RVV { vl = __riscv_vsetvl_e32m4(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); // 5.x + // auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = 
__riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl); @@ -836,7 +837,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl); auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -847,7 +848,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m1(len - i); auto v1 = __riscv_vle8_v_u8m4(src1 + i * 4, vl * 4); auto v2 = __riscv_vle8_v_u8m4(src2 + i * 4, vl * 4); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4); + auto v = common::__riscv_vabd(v1, v2, vl * 4); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4); @@ -859,7 +860,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_u8m4(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl); auto v2 = __riscv_vlse8_v_u8m4(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -880,7 +881,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e8m4(len - i); auto v1 = __riscv_vlse8_v_i8m4(src1 + cn * i + cn_index, sizeof(schar) * cn, vl); auto v2 = __riscv_vlse8_v_i8m4(src2 + cn * i + cn_index, sizeof(schar) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m4(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl); @@ -901,7 +902,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_u16m2(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl); auto v2 = __riscv_vlse16_v_u16m2(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -923,7 +924,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse16_v_i16m2(src1 + cn * i + cn_index, sizeof(short) * cn, vl); auto v2 = __riscv_vlse16_v_i16m2(src2 + cn * i + cn_index, sizeof(short) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -945,7 +946,7 @@ struct MaskedNormDiffL2_RVV { vl = __riscv_vsetvl_e16m2(len - i); auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl); auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl); - auto v = custom_intrin::__riscv_vabd(v1, v2, vl); + auto v = common::__riscv_vabd(v1, v2, vl); auto m = __riscv_vle8_v_u8m1(mask + i, vl); auto b = __riscv_vmsne(m, 0, vl); auto v_mul = __riscv_vwmulu(b, v, v, vl); @@ -1079,9 +1080,8 @@ CV_HAL_RVV_DEF_NORM_DIFF_ALL(64f, double, double, double, double) } using NormDiffFunc = int (*)(const uchar*, const uchar*, 
const uchar*, uchar*, int, int); -inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, - size_t mask_step, int width, int height, int type, int norm_type, double* result) -{ +int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step, + int width, int height, int type, int norm_type, double* result) { int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool relative = norm_type & NORM_RELATIVE; @@ -1115,19 +1115,9 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size }, }; - static const size_t elem_size_tab[CV_DEPTH_MAX] = { - sizeof(uchar), sizeof(schar), - sizeof(ushort), sizeof(short), - sizeof(int), sizeof(float), - sizeof(double), sizeof(cv::hfloat), - sizeof(cv::bfloat), sizeof(bool), - sizeof(uint64_t), sizeof(int64_t), - sizeof(unsigned), 0, - }; - CV_Assert(elem_size_tab[depth]); - - bool src_continuous = (src1_step == width * elem_size_tab[depth] * cn || (src1_step != width * elem_size_tab[depth] * cn && height == 1)); - src_continuous &= (src2_step == width * elem_size_tab[depth] * cn || (src2_step != width * elem_size_tab[depth] * cn && height == 1)); + size_t elem_size1 = static_cast(CV_ELEM_SIZE1(type)); + bool src_continuous = (src1_step == width * elem_size1 * cn || (src1_step != width * elem_size1 * cn && height == 1)); + src_continuous &= (src2_step == width * elem_size1 * cn || (src2_step != width * elem_size1 * cn && height == 1)); bool mask_continuous = (mask_step == static_cast(width)); size_t nplanes = 1; size_t size = width * height; @@ -1150,7 +1140,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size res.d = 0; if ((norm_type == NORM_L1 && depth <= CV_16S) || ((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) { - const size_t esz = elem_size_tab[depth] * cn; + const size_t esz = elem_size1 * cn; const int total = (int)size; const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn; const int blockSize = std::min(total, intSumBlockSize); @@ -1210,7 +1200,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size if(relative) { double result_; - int ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); + int ret = cv::rvv_hal::core::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_); if(ret == CV_HAL_ERROR_OK) { *result /= result_ + DBL_EPSILON; @@ -1220,6 +1210,6 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size return CV_HAL_ERROR_OK; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp b/hal/riscv-rvv/src/core/norm_hamming.cpp similarity index 89% rename from hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp rename to hal/riscv-rvv/src/core/norm_hamming.cpp index 9c19f62b7e..7a0951f3bc 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/norm_hamming.hpp +++ b/hal/riscv-rvv/src/core/norm_hamming.cpp @@ -4,18 +4,11 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
-#ifndef OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED -#define OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_normHamming8u -#define cv_hal_normHamming8u cv::cv_hal_rvv::normHamming8u -#undef cv_hal_normHammingDiff8u -#define cv_hal_normHammingDiff8u cv::cv_hal_rvv::normHammingDiff8u +#if CV_HAL_RVV_1P0_ENABLED template inline void normHammingCnt_m8(vuint8m8_t v, vbool1_t mask, size_t len_bool, size_t& result) @@ -153,7 +146,7 @@ inline void normHammingDiff8uLoop(const uchar* a, const uchar* b, size_t n, size } } -inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) +int normHamming8u(const uchar* a, int n, int cellSize, int* result) { size_t _result = 0; @@ -168,7 +161,7 @@ inline int normHamming8u(const uchar* a, int n, int cellSize, int* result) return CV_HAL_ERROR_OK; } -inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) +int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result) { size_t _result = 0; @@ -183,6 +176,6 @@ inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize return CV_HAL_ERROR_OK; } -}} // namespace cv::cv_hal_rvv +#endif // CV_HAL_RVV_1P0_ENABLED -#endif //OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp b/hal/riscv-rvv/src/core/polar_to_cart.cpp similarity index 61% rename from hal/riscv-rvv/hal_rvv_1p0/sincos.hpp rename to hal/riscv-rvv/src/core/polar_to_cart.cpp index 776d58f42c..bb5824ca49 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/sincos.hpp +++ b/hal/riscv-rvv/src/core/polar_to_cart.cpp @@ -1,16 +1,16 @@ // This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level -// directory of this distribution and at http://opencv.org/license.html. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED -#define OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace core { -namespace cv { namespace cv_hal_rvv { namespace detail { +#if CV_HAL_RVV_1P0_ENABLED + +namespace { static constexpr size_t sincos_mask = 0x3; @@ -67,6 +67,44 @@ static inline void cosval = __riscv_vfneg_mu(__riscv_vmor(idx1, idx2, vl), cosval, cosval, vl); } -}}} // namespace cv::cv_hal_rvv::detail +template +inline int polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) +{ + using T = RVV_F32M4; + const auto sincos_scale = angleInDegrees ? 
sincos_deg_scale : sincos_rad_scale; -#endif // OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED + size_t vl; + auto cos_p2 = T::vmv(sincos_cos_p2, T::setvlmax()); + auto cos_p0 = T::vmv(sincos_cos_p0, T::setvlmax()); + for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) + { + vl = RVV_T::setvl(len); + auto vangle = T::cast(RVV_T::vload(angle, vl), vl); + T::VecType vsin, vcos; + SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); + if (mag) + { + auto vmag = T::cast(RVV_T::vload(mag, vl), vl); + vsin = __riscv_vfmul(vsin, vmag, vl); + vcos = __riscv_vfmul(vcos, vmag, vl); + mag += vl; + } + RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); + RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} +int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) { + return polarToCart(mag, angle, x, y, len, angleInDegrees); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp b/hal/riscv-rvv/src/core/qr.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/qr.hpp rename to hal/riscv-rvv/src/core/qr.cpp index a7085e062b..1bb471a5aa 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/qr.hpp +++ b/hal/riscv-rvv/src/core/qr.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED -#define OPENCV_HAL_RVV_QR_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace qr { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_QR32f -#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR -#undef cv_hal_QR64f -#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/matrix_decomp.cpp, // in the function template static int cv::QRImpl @@ -171,6 +166,15 @@ inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t sr return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} +int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) { + return QR(src1, src1_step, m, n, k, src2, src2_step, dst, info); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/split.hpp b/hal/riscv-rvv/src/core/split.cpp similarity index 91% rename from hal/riscv-rvv/hal_rvv_1p0/split.hpp rename to hal/riscv-rvv/src/core/split.cpp index 9646fd9f67..1a843c939e 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/split.hpp +++ b/hal/riscv-rvv/src/core/split.cpp @@ -1,17 +1,14 @@ // This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. 
-#ifndef OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#define OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED -#include +#include "rvv_hal.hpp" -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_split8u -#define cv_hal_split8u cv::cv_hal_rvv::split8u +#if CV_HAL_RVV_1P0_ENABLED -inline int split8u(const uchar* src, uchar** dst, int len, int cn) +int split8u(const uchar* src, uchar** dst, int len, int cn) { int vl = 0; if (cn == 1) @@ -89,5 +86,6 @@ inline int split8u(const uchar* src, uchar** dst, int len, int cn) return CV_HAL_ERROR_OK; } -}} -#endif +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/core/sqrt.cpp b/hal/riscv-rvv/src/core/sqrt.cpp new file mode 100644 index 0000000000..7186f1bcca --- /dev/null +++ b/hal/riscv-rvv/src/core/sqrt.cpp @@ -0,0 +1,74 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template +inline int sqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::sqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +template +inline int invSqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, common::invSqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sqrt32f(const float* src, float* dst, int len) { + return sqrt>(src, dst, len); +} +int sqrt64f(const double* src, double* dst, int len) { + return sqrt>(src, dst, len); +} + +int invSqrt32f(const float* src, float* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. + return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} +int invSqrt64f(const double* src, double* dst, int len) { +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. + return invSqrt>(src, dst, len); +#else + return invSqrt>(src, dst, len); +#endif +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp b/hal/riscv-rvv/src/core/svd.cpp similarity index 93% rename from hal/riscv-rvv/hal_rvv_1p0/svd.hpp rename to hal/riscv-rvv/src/core/svd.cpp index 2ecad0671e..8454b60a85 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/svd.hpp +++ b/hal/riscv-rvv/src/core/svd.cpp @@ -4,22 +4,17 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
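The SVD conversion that follows (like the QR, split, and sqrt files just above) repeats the same restructuring pattern: the former header-only inline template moves into an anonymous namespace inside a .cpp, and concretely typed entry points take the place of the cv_hal_* redefinition macros. A minimal sketch of that pattern, using hypothetical decompose* names purely for illustration (the real prototypes are assumed to be declared in rvv_hal.hpp):

namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// internal worker, no longer visible outside this translation unit
template <typename T>
inline int decomposeImpl(T* src, size_t src_step, int m, int n) { /* RVV kernel body */ return CV_HAL_ERROR_OK; }
} // anonymous
// typed wrappers that the HAL dispatch binds to
int decompose32f(float* src, size_t src_step, int m, int n)  { return decomposeImpl<float>(src, src_step, m, n); }
int decompose64f(double* src, size_t src_step, int m, int n) { return decomposeImpl<double>(src, src_step, m, n); }
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core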
-#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED -#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED - +#include "rvv_hal.hpp" #include #include #include #include -#include -#include "hal_rvv_1p0/types.hpp" -namespace cv { namespace cv_hal_rvv { namespace svd { +namespace cv { namespace rvv_hal { namespace core { -#undef cv_hal_SVD32f -#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD -#undef cv_hal_SVD64f -#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD +#if CV_HAL_RVV_1P0_ENABLED + +namespace { // the algorithm is copied from core/src/lapack.cpp, // in the function template static void cv::JacobiSVDImpl_ @@ -268,6 +263,15 @@ inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step, return CV_HAL_ERROR_OK; } -}}} +} // anonymous -#endif +int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} +int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { + return SVD(src, src_step, w, u, u_step, vt, vt_step, m, n, flags); +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp b/hal/riscv-rvv/src/core/transpose.cpp similarity index 71% rename from hal/riscv-rvv/hal_rvv_1p0/transpose.hpp rename to hal/riscv-rvv/src/core/transpose.cpp index 10bf9b4d3e..4f7ccd63d2 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/transpose.hpp +++ b/hal/riscv-rvv/src/core/transpose.cpp @@ -5,12 +5,7 @@ // Copyright (C) 2025, SpaceMIT Inc., all rights reserved. // Third party copyrights are property of their respective owners. -#ifndef OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED -#define OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED - -#include - -namespace cv { namespace cv_hal_rvv { namespace transpose { +#include "rvv_hal.hpp" #if defined (__clang__) && __clang_major__ < 18 #define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \ @@ -35,18 +30,22 @@ namespace cv { namespace cv_hal_rvv { namespace transpose { #define __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(i64, 1, v0, v1, v2, v3, v4, v5, v6, v7) #endif +namespace cv { namespace rvv_hal { namespace core { + +#if CV_HAL_RVV_1P0_ENABLED + static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_8u_8xVl = [](const uchar *src, size_t src_step, uchar *dst, size_t dst_step, const int vl) { + auto transpose_8u_8xVl = [](const uchar *src, size_t sstep, uchar *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle8_v_u8m1(src, vl); - auto v1 = __riscv_vle8_v_u8m1(src + src_step, vl); - auto v2 = __riscv_vle8_v_u8m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle8_v_u8m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle8_v_u8m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle8_v_u8m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle8_v_u8m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle8_v_u8m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle8_v_u8m1(src + sstep, vl); + auto v2 = __riscv_vle8_v_u8m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle8_v_u8m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle8_v_u8m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle8_v_u8m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle8_v_u8m1(src + 6 * sstep, vl); + auto v7 = __riscv_vle8_v_u8m1(src + 7 * sstep, vl); vuint8m1x8_t v = __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7); 
- __riscv_vssseg8e8(dst, dst_step, v, vl); + __riscv_vssseg8e8(dst, dstep, v, vl); }; int h = 0, w = 0; @@ -72,17 +71,17 @@ static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_da } static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_16u_8xVl = [](const ushort *src, size_t src_step, ushort *dst, size_t dst_step, const int vl) { + auto transpose_16u_8xVl = [](const ushort *src, size_t sstep, ushort *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle16_v_u16m1(src, vl); - auto v1 = __riscv_vle16_v_u16m1(src + src_step, vl); - auto v2 = __riscv_vle16_v_u16m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle16_v_u16m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle16_v_u16m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle16_v_u16m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle16_v_u16m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle16_v_u16m1(src + 7 * src_step, vl); + auto v1 = __riscv_vle16_v_u16m1(src + sstep, vl); + auto v2 = __riscv_vle16_v_u16m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle16_v_u16m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle16_v_u16m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle16_v_u16m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle16_v_u16m1(src + 6 * sstep, vl); + auto v7 = __riscv_vle16_v_u16m1(src + 7 * sstep, vl); vuint16m1x8_t v = __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e16(dst, dst_step, v, vl); + __riscv_vssseg8e16(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(ushort); @@ -111,13 +110,13 @@ static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_32s_4xVl = [](const int *src, size_t src_step, int *dst, size_t dst_step, const int vl) { + auto transpose_32s_4xVl = [](const int *src, size_t sstep, int *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle32_v_i32m1(src, vl); - auto v1 = __riscv_vle32_v_i32m1(src + src_step, vl); - auto v2 = __riscv_vle32_v_i32m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle32_v_i32m1(src + 3 * src_step, vl); + auto v1 = __riscv_vle32_v_i32m1(src + sstep, vl); + auto v2 = __riscv_vle32_v_i32m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle32_v_i32m1(src + 3 * sstep, vl); vint32m1x4_t v = __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3); - __riscv_vssseg4e32(dst, dst_step, v, vl); + __riscv_vssseg4e32(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int); @@ -146,17 +145,17 @@ static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_d } static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) { - auto transpose_64s_8xVl = [](const int64_t *src, size_t src_step, int64_t *dst, size_t dst_step, const int vl) { + auto transpose_64s_8xVl = [](const int64_t *src, size_t sstep, int64_t *dst, size_t dstep, const int vl) { auto v0 = __riscv_vle64_v_i64m1(src, vl); - auto v1 = __riscv_vle64_v_i64m1(src + src_step, vl); - auto v2 = __riscv_vle64_v_i64m1(src + 2 * src_step, vl); - auto v3 = __riscv_vle64_v_i64m1(src + 3 * src_step, vl); - auto v4 = __riscv_vle64_v_i64m1(src + 4 * src_step, vl); - auto v5 = __riscv_vle64_v_i64m1(src + 5 * src_step, vl); - auto v6 = __riscv_vle64_v_i64m1(src + 6 * src_step, vl); - auto v7 = __riscv_vle64_v_i64m1(src + 7 * src_step, vl); 
+ auto v1 = __riscv_vle64_v_i64m1(src + sstep, vl); + auto v2 = __riscv_vle64_v_i64m1(src + 2 * sstep, vl); + auto v3 = __riscv_vle64_v_i64m1(src + 3 * sstep, vl); + auto v4 = __riscv_vle64_v_i64m1(src + 4 * sstep, vl); + auto v5 = __riscv_vle64_v_i64m1(src + 5 * sstep, vl); + auto v6 = __riscv_vle64_v_i64m1(src + 6 * sstep, vl); + auto v7 = __riscv_vle64_v_i64m1(src + 7 * sstep, vl); vint64m1x8_t v = __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - __riscv_vssseg8e64(dst, dst_step, v, vl); + __riscv_vssseg8e64(dst, dstep, v, vl); }; size_t src_step_base = src_step / sizeof(int64_t); @@ -184,12 +183,9 @@ static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst } } -#undef cv_hal_transpose2d -#define cv_hal_transpose2d cv::cv_hal_rvv::transpose::transpose2d - using Transpose2dFunc = void (*)(const uchar*, size_t, uchar*, size_t, int, int); -inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int src_width, int src_height, int element_size) { +int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int src_width, int src_height, int element_size) { if (src_data == dst_data) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -205,7 +201,7 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, 0, 0, 0, 0, 0 }; - Transpose2dFunc func = tab[element_size]; + Transpose2dFunc func = element_size <= 32 ? tab[element_size] : nullptr; if (!func) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } @@ -215,6 +211,6 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, return CV_HAL_ERROR_OK; } -}}} // cv::cv_hal_rvv::transpose +#endif // CV_HAL_RVV_1P0_ENABLED -#endif // OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED +}}} // cv::rvv_hal::core diff --git a/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp new file mode 100644 index 0000000000..0756f2e6c0 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/bilateral_filter.cpp @@ -0,0 +1,361 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
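As a reading aid for the RVV kernels below, this is the scalar per-pixel computation they vectorize, mirroring the BilateralFilter_8u_Invoker reference that the file itself cites. It is an illustrative sketch only: the actual kernels process whole rows with vector loads, accumulate into the sum/wsum buffers, and replace the final rounding with vector narrowing converts.

static void bilateralRowScalarRef(const uchar* sptr, uchar* dptr, int width, int maxk,
                                  const int* space_ofs, const float* space_weight,
                                  const float* color_weight)
{
    for (int j = 0; j < width; j++)
    {
        float sum = 0.f, wsum = 0.f;
        int val0 = sptr[j];                    // center pixel of the current row
        for (int k = 0; k < maxk; k++)
        {
            int val = sptr[j + space_ofs[k]];  // neighbour selected by the precomputed offset
            // range weight looked up by |val - val0|, combined with the spatial Gaussian weight
            float w = space_weight[k] * color_weight[val > val0 ? val - val0 : val0 - val];
            sum  += val * w;
            wsum += w;
        }
        dptr[j] = (uchar)(sum / wsum + 0.5f);  // normalized weighted average
    }
}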
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_8u_Invoker +static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vle8_v_u8m2(sptr + j, vl); + auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl); + auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto dst = __riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight) +{ + constexpr int align = 31; + std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const uchar* sptr = src_data + (i+radius) * src_step + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const uchar* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2); + src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0); + auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1); + auto ksrc2 = 
__riscv_vget_v_u8m2x3_u8m2(src, 2); + + auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl); + auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl); + auto diff2 = __riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl); + auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl); + vuint8m2x3_t dst{}; + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl)); + dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp +// in the functor BilateralFilter_32f_Invoker +static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector _sum(width + align), _wsum(width + align); + float* sum = reinterpret_cast(((size_t)_sum.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius; + memset(sum, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl); + auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, 
__riscv_vle32_v_f32m4(wsum + j, vl), vl), vl); + __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto src = __riscv_vle32_v_f32m4(sptr + j, vl); + auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl); + __riscv_vse32(reinterpret_cast(dst_data + i * dst_step) + j, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index) +{ + constexpr int align = 31; + std::vector _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align); + float* sum_b = reinterpret_cast(((size_t)_sum_b.data() + align) & ~align); + float* sum_g = reinterpret_cast(((size_t)_sum_g.data() + align) & ~align); + float* sum_r = reinterpret_cast(((size_t)_sum_r.data() + align) & ~align); + float* wsum = reinterpret_cast(((size_t)_wsum.data() + align) & ~align); + + for (int i = start; i < end; i++) + { + const float* sptr = reinterpret_cast(src_data + (i+radius) * src_step) + radius*3; + memset(sum_b, 0, sizeof(float) * width); + memset(sum_g, 0, sizeof(float) * width); + memset(sum_r, 0, sizeof(float) * width); + memset(wsum, 0, sizeof(float) * width); + for(int k = 0; k < maxk; k++) + { + const float* ksptr = sptr + space_ofs[k]; + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl); + auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl); + auto idx = __riscv_vfcvt_rtz_x(diff, vl); + auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl); + + auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl); + auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl); + w = __riscv_vfmul(w, space_weight[k], vl); + + __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl); + __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl); + __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl); + __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl); + } + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl); + auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); 
+ auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl)); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl)); + __riscv_vsseg3e32(reinterpret_cast(dst_data + i * dst_step) + j * 3, dst, vl); + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp +// in the function static void bilateralFilter_8u and bilateralFilter_32f +int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && type != CV_32FC3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (type == CV_32FC1 && width * height > 1 << 20) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_data == dst_data || border_type & BORDER_ISOLATED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + sigma_color = sigma_color <= 0 ? 1 : sigma_color; + sigma_space = sigma_space <= 0 ? 1 : sigma_space; + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2; + radius = std::max(radius, 1); + d = radius*2 + 1; + + const int size = depth == CV_32F ? cn * sizeof(float) : cn; + const int temp_step = (width + radius * 2) * size; + std::vector _temp((width + radius * 2) * (height + radius * 2) * size, 0); + uchar* temp = _temp.data(); + std::vector width_interpolate(radius * 2); + for (int j = 0; j < radius; j++) + { + width_interpolate[j] = common::borderInterpolate(j - radius, width, border_type); + width_interpolate[j + radius] = common::borderInterpolate(width + j, width, border_type); + } + for (int i = 0; i < height + radius * 2; i++) + { + int x = common::borderInterpolate(i - radius, height, border_type); + if (x != -1) + { + for (int j = 0; j < radius; j++) + { + int y = width_interpolate[j]; + if (y != -1) + memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size); + y = width_interpolate[j + radius]; + if (y != -1) + memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size); + } + memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size); + } + } + + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* space_weight = _space_weight.data(); + int* space_ofs = _space_ofs.data(); + int maxk = 0; + for (int i = -radius; i <= radius; i++) + { + for (int j = -radius; j <= radius; j++) + { + double r = std::sqrt((double)i*i + (double)j*j); + if (r <= radius && (depth == CV_8U || i != 0 || j != 0)) + { + space_weight[maxk] = static_cast(r*r*gauss_space_coeff); + space_ofs[maxk++] = (i * (temp_step / size) + j) * cn; + } + } + } + cv::rvv_hal::core::exp32f(space_weight, space_weight, maxk); + + if (depth == CV_8U) + { + std::vector _color_weight(cn*256); + float* color_weight = _color_weight.data(); + for (int i = 0; i < 256*cn; i++) + color_weight[i] = static_cast(i*i*gauss_color_coeff); + 
cv::rvv_hal::core::exp32f(color_weight, color_weight, 256*cn); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + case 3: + return common::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight); + } + } + else + { + double minValSrc = -1, maxValSrc = 1; + cv::rvv_hal::core::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + for (int i = 0; i < width; i++) + memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size); + return CV_HAL_ERROR_OK; + } + + const int kExpNumBinsPerChannel = 1 << 12; + const int kExpNumBins = kExpNumBinsPerChannel * cn; + const float scale_index = kExpNumBins / static_cast((maxValSrc - minValSrc) * cn); + std::vector _expLUT(kExpNumBins+2, 0); + float* expLUT = _expLUT.data(); + for (int i = 0; i < kExpNumBins+2; i++) + { + double val = i / scale_index; + expLUT[i] = static_cast(val * val * gauss_color_coeff); + } + cv::rvv_hal::core::exp32f(expLUT, expLUT, kExpNumBins+2); + + switch (cn) + { + case 1: + return common::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + case 3: + return common::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index); + } + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/box_filter.cpp b/hal/riscv-rvv/src/imgproc/box_filter.cpp new file mode 100644 index 0000000000..8a91ef57bb --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/box_filter.cpp @@ -0,0 +1,392 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
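As with the bilateral filter above, the following is a scalar sketch of what the boxFilterC1/boxFilterC3 kernels below compute: a ksize x ksize neighbourhood sum around each pixel, optionally normalized by ksize*ksize. Steps are given in elements and border handling is omitted for brevity; the real code resolves borders with common::borderInterpolate and caches per-row horizontal sums in a ring buffer of ksize rows, so each output row needs only ksize vector additions.

static void boxFilterScalarRef(const float* src, size_t src_step, float* dst, size_t dst_step,
                               int width, int height, int ksize, int anchor, bool normalize)
{
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
        {
            float sum = 0.f;
            for (int dy = 0; dy < ksize; dy++)
                for (int dx = 0; dx < ksize; dx++)
                {
                    int sy = y + dy - anchor, sx = x + dx - anchor;  // border handling omitted
                    if (sy >= 0 && sy < height && sx >= 0 && sx < width)
                        sum += src[sy * src_step + sx];
                }
            dst[y * dst_step + x] = normalize ? sum / (float)(ksize * ksize) : sum;
        }
}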
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +template struct rvv; +template<> struct rvv +{ + static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); } + static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); } + static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); } + static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; } + static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); } +}; + +// the algorithm is same as cv_hal_sepFilter +template +static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? 
noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - anchor_x; + auto src = rvv::vcvt0(helperT::vload(extra, vl), vl); + + extra += vl; + auto sum = src; + src = helperWT::vslide1down(src, extra[0], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[1], vl); + sum = helperWT::vadd(sum, src, vl); + if (ksize == 5) + { + src = helperWT::vslide1down(src, extra[2], vl); + sum = helperWT::vadd(sum, src, vl); + src = helperWT::vslide1down(src, extra[3], vl); + sum = helperWT::vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl); + if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl); + if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl); + if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl); + if (normalize) sum = rvv::vdiv(sum, ksize * ksize, vl); + + if (cast) + { + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, rvv::vcvt1(sum, vl), vl); + } + else + { + helperWT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type); + return pj < 0 ? 
noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; }; + + std::vector res(width * ksize * 3); + auto process = [&](int x, int y) { + float sum0, sum1, sum2; + sum0 = sum1 = sum2 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += reinterpret_cast(src_data + x * src_step)[p * 3 ]; + sum1 += reinterpret_cast(src_data + x * src_step)[p * 3 + 1]; + sum2 += reinterpret_cast(src_data + x * src_step)[p * 3 + 2]; + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + }; + + const int left = anchor_x, right = width - (ksize - 1 - anchor_x); + for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e32m2(right - j); + const float* extra = reinterpret_cast(src_data + i * src_step) + (j - anchor_x) * 3; + auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl); + auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0); + auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1); + auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2); + + extra += vl * 3; + auto sum0 = src0, sum1 = src1, sum2 = src2; + src0 = __riscv_vfslide1down(src0, extra[0], vl); + src1 = __riscv_vfslide1down(src1, extra[1], vl); + src2 = __riscv_vfslide1down(src2, extra[2], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, extra[3], vl); + src1 = __riscv_vfslide1down(src1, extra[4], vl); + src2 = __riscv_vfslide1down(src2, extra[5], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + if (ksize == 5) + { + src0 = __riscv_vfslide1down(src0, extra[6], vl); + src1 = __riscv_vfslide1down(src1, extra[7], vl); + src2 = __riscv_vfslide1down(src2, extra[8], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + src0 = __riscv_vfslide1down(src0, extra[ 9], vl); + src1 = __riscv_vfslide1down(src1, extra[10], vl); + src2 = __riscv_vfslide1down(src2, extra[11], vl); + sum0 = __riscv_vfadd(sum0, src0, vl); + sum1 = __riscv_vfadd(sum1, src1, vl); + sum2 = __riscv_vfadd(sum2, src2, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - (ksize - 1 - anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? 
nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m2(width - j); + vfloat32m2_t sum0, sum1, sum2; + sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl); + auto loadres = [&](const float* row) { + if (!row) return; + auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl); + sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl); + sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl); + sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl); + }; + loadres(row0); + loadres(row1); + loadres(row2); + loadres(row3); + loadres(row4); + if (normalize) + { + sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl); + sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl); + sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl); + } + + vfloat32m2x3_t dst{}; + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1); + dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2); + __riscv_vsseg3e32(reinterpret_cast(dst_data + cur * dst_step) + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type) +{ + const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn); + if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
ksize_height / 2 : anchor_y; + if (src_type != dst_type) + { + if (src_type == CV_8UC1 && dst_type == CV_16UC1) + { + if (ksize_width == 3) + { + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + if (ksize_width == 5) + { + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + } + } + } + else + { + switch (ksize_width*100 + src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32SC1: + res = common::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 300 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + case 500 + CV_32FC3: + res = common::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, 
margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type); + break; + } + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/color.hpp b/hal/riscv-rvv/src/imgproc/color.cpp similarity index 90% rename from hal/riscv-rvv/hal_rvv_1p0/color.hpp rename to hal/riscv-rvv/src/imgproc/color.cpp index c715c6ad38..1b7ee0a4d3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/color.hpp +++ b/hal/riscv-rvv/src/imgproc/color.cpp @@ -4,12 +4,12 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_COLOR_HPP_INCLUDED -#define OPENCV_HAL_RVV_COLOR_HPP_INCLUDED +#include "rvv_hal.hpp" +#include -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED namespace color { class ColorInvoker : public ParallelLoopBody @@ -41,11 +41,9 @@ namespace color { { return val - std::remainder(val, 1.0); } -} // cv::cv_hal_rvv::color +} // cv::rvv_hal::color namespace BGRtoBGR { -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::BGRtoBGR::cvtBGRtoBGR template struct rvv; template<> struct rvv @@ -206,27 +204,26 @@ static inline int cvtBGRtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +} // BGRtoBGR + +int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) { if ((scn != 3 && scn != 4) || (dcn != 3 && dcn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_16U: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); case CV_32F: - return cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); + return BGRtoBGR::cvtBGRtoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoBGR namespace GraytoBGR { -#undef cv_hal_cvtGraytoBGR -#define cv_hal_cvtGraytoBGR cv::cv_hal_rvv::GraytoBGR::cvtGraytoBGR template struct rvv; template<> struct rvv @@ -337,27 +334,26 @@ static inline int cvtGraytoBGR(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn) +} // GraytoBGR + +int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar 
* dst_data, size_t dst_step, int width, int height, int depth, int dcn) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); case CV_16U: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); case CV_32F: - return cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); + return GraytoBGR::cvtGraytoBGR(0, height, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::GraytoBGR namespace BGRtoGray { -#undef cv_hal_cvtBGRtoGray -#define cv_hal_cvtBGRtoGray cv::cv_hal_rvv::BGRtoGray::cvtBGRtoGray template struct rvv; template<> struct rvv @@ -462,27 +458,26 @@ static inline int cvtBGRtoGray(int start, int end, const T * src, size_t src_ste return CV_HAL_ERROR_OK; } -inline int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoGray + +int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoGray namespace BGR5x5toBGR { -#undef cv_hal_cvtBGR5x5toBGR -#define cv_hal_cvtBGR5x5toBGR cv::cv_hal_rvv::BGR5x5toBGR::cvtBGR5x5toBGR // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52RGB @@ -540,18 +535,17 @@ static inline int cvtBGR5x5toBGR_u(int start, int end, const ushort * src, size_ return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) +} // BGR5x5toBGR + +int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits) { if ((dcn != 3 && dcn != 4) || (greenBits != 5 && 
greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); + return color::invoke(width, height, {BGR5x5toBGR::cvtBGR5x5toBGR_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toBGR namespace BGRtoBGR5x5 { -#undef cv_hal_cvtBGRtoBGR5x5 -#define cv_hal_cvtBGRtoBGR5x5 cv::cv_hal_rvv::BGRtoBGR5x5::cvtBGRtoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB2RGB5x5 @@ -604,18 +598,17 @@ static inline int cvtBGRtoBGR5x5_u(int start, int end, const uchar * src, size_t return CV_HAL_ERROR_OK; } -inline int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits) +} // BGRtoBGR5x5 + +int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits) { if ((scn != 3 && scn != 4) || (greenBits != 5 && greenBits != 6)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); + return color::invoke(width, height, {BGRtoBGR5x5::cvtBGRtoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, greenBits); } -} // cv::cv_hal_rvv::BGRtoBGR5x5 namespace BGR5x5toGray { -#undef cv_hal_cvtBGR5x5toGray -#define cv_hal_cvtBGR5x5toGray cv::cv_hal_rvv::BGR5x5toGray::cvtBGR5x5toGray // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct RGB5x52Gray @@ -654,18 +647,17 @@ static inline int cvtBGR5x5toGray_u(int start, int end, const ushort * src, size return CV_HAL_ERROR_OK; } -inline int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // BGR5x5toGray + +int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {BGR5x5toGray::cvtBGR5x5toGray_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::BGR5x5toGray namespace GraytoBGR5x5 { -#undef cv_hal_cvtGraytoBGR5x5 -#define cv_hal_cvtGraytoBGR5x5 cv::cv_hal_rvv::GraytoBGR5x5::cvtGraytoBGR5x5 // the algorithm is copied from imgproc/src/color_rgb.simd.cpp, // in the functor struct Gray2RGB5x5 @@ -697,18 +689,17 @@ static inline int cvtGraytoBGR5x5_u(int start, int end, const uchar * src, size_ return CV_HAL_ERROR_OK; } -inline int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) +} // GraytoBGR5x5 + +int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits) { if (greenBits != 5 && greenBits != 6) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), 
src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); + return color::invoke(width, height, {GraytoBGR5x5::cvtGraytoBGR5x5_u}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, greenBits); } -} // cv::cv_hal_rvv::GraytoBGR5x5 namespace YUVtoBGR { -#undef cv_hal_cvtYUVtoBGR -#define cv_hal_cvtYUVtoBGR cv::cv_hal_rvv::YUVtoBGR::cvtYUVtoBGR template struct rvv; template<> struct rvv @@ -857,27 +848,26 @@ static inline int cvtYUVtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) +} // YUVtoBGR + +int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, height, {cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); + return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::YUVtoBGR namespace BGRtoYUV { -#undef cv_hal_cvtBGRtoYUV -#define cv_hal_cvtBGRtoYUV cv::cv_hal_rvv::BGRtoYUV::cvtBGRtoYUV template struct rvv; template<> struct rvv @@ -1027,31 +1017,26 @@ static inline int cvtBGRtoYUV(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) +} // BGRtoYUV + +int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_16U: - return color::invoke(width, height, {cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); case CV_32F: - return color::invoke(width, height, 
{cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); + return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isCbCr); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoYUV namespace PlaneYUVtoBGR { -#undef cv_hal_cvtOnePlaneYUVtoBGR -#define cv_hal_cvtOnePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtOnePlaneYUVtoBGR -#undef cv_hal_cvtTwoPlaneYUVtoBGR -#define cv_hal_cvtTwoPlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtTwoPlaneYUVtoBGR -#undef cv_hal_cvtThreePlaneYUVtoBGR -#define cv_hal_cvtThreePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtThreePlaneYUVtoBGR static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CY = 1220542; @@ -1241,22 +1226,24 @@ static inline int cvtMultiPlaneYUVtoBGR(int start, int end, uchar * dst_data, si return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) +} // PlaneYUVtoBGR + +int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(dst_width, dst_height, {cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); + return color::invoke(dst_width, dst_height, {PlaneYUVtoBGR::cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx); } -inline int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; const uchar* uv = src_data + src_step * static_cast(dst_height); - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx); } -inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) +int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1267,17 +1254,10 @@ inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar int vstepIdx = dst_height % 4 == 2 ? 
1 : 0; if (uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); } - return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); + return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1); } -} // cv::cv_hal_rvv::PlaneYUVtoBGR namespace PlaneBGRtoYUV { -#undef cv_hal_cvtOnePlaneBGRtoYUV -#define cv_hal_cvtOnePlaneBGRtoYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtOnePlaneBGRtoYUV -#undef cv_hal_cvtBGRtoTwoPlaneYUV -#define cv_hal_cvtBGRtoTwoPlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoTwoPlaneYUV -#undef cv_hal_cvtBGRtoThreePlaneYUV -#define cv_hal_cvtBGRtoThreePlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoThreePlaneYUV static const int ITUR_BT_601_SHIFT = 20; static const int ITUR_BT_601_CBY = 102760; // 0.114035 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT) @@ -1512,35 +1492,34 @@ static inline int cvtBGRtoMultiPlaneYUV(int start, int end, uchar * yData, uchar return CV_HAL_ERROR_OK; } -inline int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) +} // PlaneBGRtoYUV + +int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height, {cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); + return color::invoke(width, height, {PlaneBGRtoYUV::cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx); } -inline int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, +int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (y_step != uv_step || (scn != 3 && scn != 4)) return CV_HAL_ERROR_NOT_IMPLEMENTED; - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2); } -inline int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) +int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; uchar* uv_data = dst_data + dst_step * static_cast(height); - return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 3 : 2); + return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 
3 : 2); } -} // cv::cv_hal_rvv::PlaneBGRtoYUV namespace HSVtoBGR { -#undef cv_hal_cvtHSVtoBGR -#define cv_hal_cvtHSVtoBGR cv::cv_hal_rvv::HSVtoBGR::cvtHSVtoBGR template static inline int cvtHSVtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isFullRange, bool isHSV); @@ -1710,25 +1689,24 @@ inline int cvtHSVtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) +} // HSVtoBGR + +int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::HSVtoBGR namespace BGRtoHSV { -#undef cv_hal_cvtBGRtoHSV -#define cv_hal_cvtBGRtoHSV cv::cv_hal_rvv::BGRtoHSV::cvtBGRtoHSV template static inline int cvtBGRtoHSV(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int scn, bool swapBlue, bool isFullRange, bool isHSV); @@ -1870,25 +1848,24 @@ inline int cvtBGRtoHSV(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) +} // BGRtoHSV + +int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); case CV_32F: - return color::invoke(width, height, {cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); + return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoHSV namespace XYZtoBGR { -#undef cv_hal_cvtXYZtoBGR -#define cv_hal_cvtXYZtoBGR cv::cv_hal_rvv::XYZtoBGR::cvtXYZtoBGR 
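Every conversion in this file undergoes the same mechanical change: the per-conversion namespace keeps only the row kernels, the #undef/#define pair that used to bind the cv_hal_* dispatch macro to the local wrapper is dropped (that binding presumably now lives in the HAL interface header), and the wrapper itself becomes an ordinary non-inline function at cv::rvv_hal::imgproc scope that names the kernel with an explicit namespace qualifier. A compile-only sketch of the shape, using a hypothetical cvtFoo rather than any function from this patch:

// Before: the wrapper sat inside the per-conversion namespace and the file
// redefined the cv_hal_* dispatch macro itself:
//
//   namespace Foo {
//   #undef  cv_hal_cvtFoo
//   #define cv_hal_cvtFoo cv::cv_hal_rvv::Foo::cvtFoo
//   static inline int cvtFoo_u(int start, int end, ...);
//   inline int cvtFoo(...) { return color::invoke(..., {cvtFoo_u}, ...); }
//   } // cv::cv_hal_rvv::Foo
//
// After: only the kernel stays in the namespace; the exported entry point is a
// plain function at enclosing scope and qualifies the kernel name explicitly.
namespace Foo {
static inline int cvtFoo_u(int start, int end) { (void)start; (void)end; return 0; }
} // Foo

int cvtFoo() { return Foo::cvtFoo_u(0, 1); }

int main() { return cvtFoo(); }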
template struct rvv; template<> struct rvv @@ -2042,27 +2019,26 @@ static inline int cvtXYZtoBGR(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue) +} // XYZtoBGR + +int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); + return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::XYZtoBGR namespace BGRtoXYZ { -#undef cv_hal_cvtBGRtoXYZ -#define cv_hal_cvtBGRtoXYZ cv::cv_hal_rvv::BGRtoXYZ::cvtBGRtoXYZ template struct rvv; template<> struct rvv @@ -2209,23 +2185,24 @@ static inline int cvtBGRtoXYZ(int start, int end, const T * src, size_t src_step return CV_HAL_ERROR_OK; } -inline int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) +} // BGRtoXYZ + +int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_16U: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); + return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoXYZ namespace LabTable { @@ -2495,11 +2472,9 @@ namespace LabTable return __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(__riscv_vget_v_f32m2x4_f32m2(val, 3), x, 
__riscv_vget_v_f32m2x4_f32m2(val, 2), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 1), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 0), vl); } }; -} // cv::cv_hal_rvv::LabTable +} // cv::rvv_hal::imgproc::LabTable namespace LabtoBGR { -#undef cv_hal_cvtLabtoBGR -#define cv_hal_cvtLabtoBGR cv::cv_hal_rvv::LabtoBGR::cvtLabtoBGR template static inline int cvtLabtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isLab, bool srgb); @@ -2713,25 +2688,24 @@ inline int cvtLabtoBGR(int start, int end, const float * src, size_t src_ return CV_HAL_ERROR_OK; } -inline int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) +} // LabtoBGR + +int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb) { if (dcn != 3 && dcn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; switch (depth) { case CV_8U: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); case CV_32F: - return color::invoke(width, height, {cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); + return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::LabtoBGR namespace BGRtoLab { -#undef cv_hal_cvtBGRtoLab -#define cv_hal_cvtBGRtoLab cv::cv_hal_rvv::BGRtoLab::cvtBGRtoLab struct rvv_base { @@ -3060,31 +3034,126 @@ static inline int cvtBGRtoLab_f(int start, int end, const float * src, size_t sr return CV_HAL_ERROR_OK; } -inline int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) +} // BGRtoLab + +int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb) { if (scn != 3 && scn != 4) return CV_HAL_ERROR_NOT_IMPLEMENTED; - auto cvtBGRtoLab_b = cvtBGRtoLab_u; + auto cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; if (!isLab && !srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (!isLab && srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; else if (isLab && !srgb) - cvtBGRtoLab_b = cvtBGRtoLab_u; + cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u; switch (depth) { case CV_8U: return color::invoke(width, height, {cvtBGRtoLab_b}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue); case CV_32F: - return color::invoke(width, height, {cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); + return color::invoke(width, height, {BGRtoLab::cvtBGRtoLab_f}, reinterpret_cast(src_data), src_step, reinterpret_cast(dst_data), dst_step, width, scn, swapBlue, isLab, srgb); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::BGRtoLab -}} 
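The nested __riscv_vfmadd chain in the LabTable helpers above is Horner evaluation of a cubic segment: with the four coefficients c0..c3 taken from the f32m2x4 tuple, it computes ((c3*x + c2)*x + c1)*x + c0. A scalar equivalent (horner_cubic and the coefficient values are placeholders, not the real table entries):

#include <cstdio>

// Horner evaluation of c0 + c1*x + c2*x^2 + c3*x^3, matching the order of the
// chained vfmadd calls (innermost multiply uses c3, outermost add uses c0).
static float horner_cubic(const float c[4], float x)
{
    return ((c[3] * x + c[2]) * x + c[1]) * x + c[0];
}

int main()
{
    const float c[4] = {1.f, 2.f, 3.f, 4.f}; // placeholder coefficients
    std::printf("%f\n", horner_cubic(c, 0.5f)); // 1 + 2*0.5 + 3*0.25 + 4*0.125 = 3.25
    return 0;
}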
+#endif // CV_HAL_RVV_1P0_ENABLED -#endif +#if CV_HAL_RVV_071_ENABLED + +static const unsigned char index_array_32 [32] + { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; + +static const unsigned char index_array_24 [24] + { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; + +static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) +{ + vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); + + int i = 0; + + for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) + { + vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); + vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); + vse8_v_u8m2(dst, vec_dst, vsize); + } + + for ( ; i < n; i++, src += scn, dst += dcn ) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[2] = t0; + dst[1] = t1; + dst[0] = t2; + if(dcn == 4) + { + unsigned char d = src[3]; + dst[3] = d; + } + } +} + +static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) +{ + for (int i = 0; i < n; i++, src += scn, dst += dcn) + { + unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; + dst[bi ] = t0; + dst[1] = t1; + dst[bi^2] = t2; + if(dcn == 4) + { + unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); + dst[3] = d; + } + } +} + +int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) +{ + if (depth != CV_8U) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int blueIdx = swapBlue ? 2 : 0; + if (scn == dcn) + { + if (!swapBlue) + { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + const int vsize_pixels = 8; + + if (scn == 4) + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + { + vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); + } + } + } + else + { + for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) + sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_071_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/common.hpp b/hal/riscv-rvv/src/imgproc/common.hpp new file mode 100644 index 0000000000..819b43421c --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/common.hpp @@ -0,0 +1,76 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Third party copyrights are property of their respective owners. 
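In the 0.7.1 path above, the R/B swap for same-channel-count conversions is a single vrgather_vv_u8m2: each destination byte i takes src[index[i]], and the index tables just swap bytes 0 and 2 inside every 3- or 4-byte pixel while leaving G (and alpha) in place. A scalar model of that gather (gather_swap_rb is an illustrative name, not part of the patch):

#include <cstdio>

// dst[i] = src[index[i]] is the scalar meaning of the vrgather; with the
// 4-channel table {2, 1, 0, 3, 6, 5, 4, 7, ...} it swaps B and R per pixel.
static void gather_swap_rb(const unsigned char* src, unsigned char* dst,
                           const unsigned char* index, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = src[index[i]];
}

int main()
{
    const unsigned char index4[8] = {2, 1, 0, 3, 6, 5, 4, 7};
    const unsigned char bgra[8]   = {10, 20, 30, 40, 50, 60, 70, 80}; // two BGRA pixels
    unsigned char rgba[8];
    gather_swap_rb(bgra, rgba, index4, 8);
    for (int i = 0; i < 8; i++)
        std::printf("%d ", rgba[i]); // 30 20 10 40 70 60 50 80
    return 0;
}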
+ +#ifndef OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED +#define OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED + +#include "opencv2/core/hal/interface.h" +#include "opencv2/imgproc/hal/interface.h" + +namespace cv { namespace rvv_hal { namespace imgproc { namespace common { + +inline int borderInterpolate( int p, int len, int borderType ) +{ + if ((unsigned)p < (unsigned)len) + ; + else if (borderType == CV_HAL_BORDER_REPLICATE) + p = p < 0 ? 0 : len - 1; + else if (borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101) + { + int delta = borderType == CV_HAL_BORDER_REFLECT_101; + if (len == 1) + return 0; + do + { + if (p < 0) + p = -p - 1 + delta; + else + p = len - 1 - (p - len) - delta; + } + while( (unsigned)p >= (unsigned)len ); + } + else if (borderType == CV_HAL_BORDER_WRAP) + { + if (p < 0) + p -= ((p-len+1)/len)*len; + if (p >= len) + p %= len; + } + else if (borderType == CV_HAL_BORDER_CONSTANT) + p = -1; + return p; +} + +class FilterInvoker : public ParallelLoopBody +{ +public: + template + FilterInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +template +inline int invoke(int height, std::function func, Args&&... args) +{ + cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward(args)...), cv::getNumThreads()); + return func(0, 1, std::forward(args)...); +} + +}}}} // cv::rvv_hal::imgproc::common + +#endif // OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED diff --git a/hal/riscv-rvv/src/imgproc/filter.cpp b/hal/riscv-rvv/src/imgproc/filter.cpp new file mode 100644 index 0000000000..f23b56e01d --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/filter.cpp @@ -0,0 +1,264 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
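common::borderInterpolate above maps an out-of-range source index to a valid one (or to -1 for BORDER_CONSTANT), and common::invoke runs a row-range kernel through cv::parallel_for_ over rows 1..height-1 while handling row 0 on the calling thread. A standalone model of the index mapping for three of the border modes (border_index and the local enum stand in for the CV_HAL_BORDER_* constants; only the REFLECT_101 flavour of reflection is modelled):

#include <cstdio>

enum Border { REPLICATE, REFLECT_101, WRAP };

static int border_index(int p, int len, Border mode)
{
    if ((unsigned)p < (unsigned)len)
        return p;
    switch (mode)
    {
    case REPLICATE:
        return p < 0 ? 0 : len - 1;
    case REFLECT_101:
        if (len == 1) return 0;
        do {
            p = p < 0 ? -p : 2 * len - 2 - p; // mirror without repeating the edge pixel
        } while ((unsigned)p >= (unsigned)len);
        return p;
    case WRAP:
        if (p < 0) p -= ((p - len + 1) / len) * len;
        if (p >= len) p %= len;
        return p;
    }
    return -1;
}

int main()
{
    // For len = 5, index -2 maps to 0 (replicate), 2 (reflect_101), 3 (wrap).
    std::printf("%d %d %d\n",
                border_index(-2, 5, REPLICATE),
                border_index(-2, 5, REFLECT_101),
                border_index(-2, 5, WRAP));
    return 0;
}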
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Filter2D +{ + const uchar* kernel_data; + size_t kernel_step; + int kernel_type; + int kernel_width; + int kernel_height; + int src_type; + int dst_type; + int borderType; + double delta; + int anchor_x; + int anchor_y; +}; + +static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + return __riscv_vfmacc(a, k2, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, extra[0], extra[4]); + s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]); + s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]); + s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst) +{ + int vl; + for (int i = left; i < right; i += vl) + { + vl = __riscv_vsetvl_e8m1(right - i); + auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl); + auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl); + + auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) { + a = __riscv_vfmacc(a, k0, b, vl); + b = __riscv_vfslide1down(b, r1, vl); + a = __riscv_vfmacc(a, k1, b, vl); + b = __riscv_vfslide1down(b, r2, vl); + a = __riscv_vfmacc(a, k2, b, vl); + b = __riscv_vfslide1down(b, r3, vl); + a = __riscv_vfmacc(a, k3, b, vl); + b = __riscv_vfslide1down(b, r4, vl); + 
return __riscv_vfmacc(a, k4, b, vl); + }; + auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) { + if (!row) return; + + const uchar* extra = row + (i - anchor) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl); + auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl); + auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl); + auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl); + + extra += vl * 4; + s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]); + s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]); + s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]); + s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]); + }; + + loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]); + loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]); + loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]); + loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]); + loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]); + vuint8m1x4_t val{}; + val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl)); + val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst + i * 4, val, vl); + } +} + +// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp, +// in the function void CAROTENE_NS::convolution +template +static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + float kernel[ksize * ksize]; + for (int i = 0; i < ksize * ksize; i++) + { + kernel[i] = reinterpret_cast(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize]; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? 
noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + float sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = data->delta; + for (int i = 0; i < ksize * ksize; i++) + { + auto p = access(x + i / ksize, y + i % ksize); + if (p.first != noval && p.second != noval) + { + sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4 ]; + sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1]; + sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2]; + sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3]; + } + } + dst_data[(x * width + y) * 4 ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits::max())); + dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits::max())); + }; + + const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step; + if (ksize == 3) + { + process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4); + } + else + { + const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step; + const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step; + process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_32FC1 || src_type != CV_8UC4 || dst_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != 3 && kernel_width != 5) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? 
kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y}); + return CV_HAL_ERROR_OK; +} + +int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + Filter2D* data = reinterpret_cast(context); + std::vector dst(width * height * 4); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernel_width) + { + case 3: + res = common::invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + case 5: + res = common::invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4); + return res; +} + +int filterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp new file mode 100644 index 0000000000..495efa4ee7 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/gaussian_blur.cpp @@ -0,0 +1,389 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is same as cv_hal_sepFilter +template +static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + using T = typename helperT::ElemType; + using WT = typename helperWT::ElemType; + + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); // [TODO] fix dependencies + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? 
noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize); + auto process = [&](int x, int y) { + WT sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kernel[ksize == 5][i] * static_cast(reinterpret_cast(src_data + x * src_step)[p]); + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = helperT::setvl(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - ksize / 2; + auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl); + + extra += vl; + auto sum = src; + if (ksize == 3) + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, src, vl); + } + else + { + src = __riscv_vslide1down(src, extra[0], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[1], vl); + sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl); + src = __riscv_vslide1down(src, extra[2], vl); + sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl); + src = __riscv_vslide1down(src, extra[3], vl); + sum = __riscv_vadd(sum, src, vl); + } + helperWT::vstore(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const WT* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = helperWT::setvl(width - j); + auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl); + auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl); + auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl); + typename helperWT::VecType sum; + if (ksize == 3) + { + sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl); + } + else + { + sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl); + auto v3 = row3 ? helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl); + auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl); + sum = __riscv_vadd(sum, v4, vl); + } + helperT::vstore(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 
8 : 4, __RISCV_VXRM_RNU, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); + return pi < 0 ? noval : pi - offset_y; + }; + auto accessY = [&](int y) { + int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type); + return pj < 0 ? noval : pj - offset_x; + }; + auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; }; + + constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}}; + std::vector res(width * ksize * 4); + auto process = [&](int x, int y) { + ushort sum0, sum1, sum2, sum3; + sum0 = sum1 = sum2 = sum3 = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum0 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 ]); + sum1 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 1]); + sum2 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 2]); + sum3 += kernel[ksize == 5][i] * static_cast((src_data + x * src_step)[p * 4 + 3]); + } + } + res[p2idx(x, y) ] = sum0; + res[p2idx(x, y) + 1] = sum1; + res[p2idx(x, y) + 2] = sum2; + res[p2idx(x, y) + 3] = sum3; + }; + + const int left = ksize / 2, right = width - ksize / 2; + for (int i = start - ksize / 2; i < end + ksize / 2; i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m1(right - j); + const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4; + auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl); + auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl); + auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl); + auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl); + auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl); + + extra += vl * 4; + auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3; + if (ksize == 3) + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + else + { + src0 = __riscv_vslide1down(src0, extra[0], vl); + src1 = __riscv_vslide1down(src1, extra[1], vl); + src2 = __riscv_vslide1down(src2, extra[2], vl); + src3 = __riscv_vslide1down(src3, extra[3], vl); + sum0 = 
__riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[4], vl); + src1 = __riscv_vslide1down(src1, extra[5], vl); + src2 = __riscv_vslide1down(src2, extra[6], vl); + src3 = __riscv_vslide1down(src3, extra[7], vl); + sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl); + src0 = __riscv_vslide1down(src0, extra[ 8], vl); + src1 = __riscv_vslide1down(src1, extra[ 9], vl); + src2 = __riscv_vslide1down(src2, extra[10], vl); + src3 = __riscv_vslide1down(src3, extra[11], vl); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + src0 = __riscv_vslide1down(src0, extra[12], vl); + src1 = __riscv_vslide1down(src1, extra[13], vl); + src2 = __riscv_vslide1down(src2, extra[14], vl); + src3 = __riscv_vslide1down(src3, extra[15], vl); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + + vuint16m2x4_t dst{}; + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2); + dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3); + __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl); + } + } + } + + int cur = i - ksize / 2; + if (cur >= start) + { + const ushort* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const ushort* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e16m2(width - j); + vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{}; + sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl); + + auto loadres = [&](const ushort* row) { + auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl); + src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0); + src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1); + src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2); + src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3); + }; + if (row0) + { + loadres(row0); + sum0 = src0; + sum1 = src1; + sum2 = src2; + sum3 = src3; + } + if (row1) + { + loadres(row1); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 
2 : 1, vl), vl); + } + if (row2) + { + loadres(row2); + if (ksize == 5) + { + src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl); + src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl); + src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl); + src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl); + } + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + if (row3) + { + loadres(row3); + sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl); + sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl); + sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl); + sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl); + } + if (row4) + { + loadres(row4); + sum0 = __riscv_vadd(sum0, src0, vl); + sum1 = __riscv_vadd(sum1, src1, vl); + sum2 = __riscv_vadd(sum2, src2, vl); + sum3 = __riscv_vadd(sum3, src3, vl); + } + + vuint8m1x4_t dst{}; + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type) +{ + const int type = CV_MAKETYPE(depth, cn); + if ((type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || border_type & BORDER_ISOLATED || border_type == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_16UC1: + return common::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_16UC1: + return common::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 300 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + case 500 + CV_8UC4: + return common::invoke(height, {gaussianBlurC4<5>}, 
src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, border_type); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/histogram.cpp b/hal/riscv-rvv/src/imgproc/histogram.cpp new file mode 100644 index 0000000000..eb496f0709 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/histogram.cpp @@ -0,0 +1,282 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. +// Copyright (C) 2025, SpaceMIT Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#include "rvv_hal.hpp" +#include +#include + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +class HistogramInvoker : public ParallelLoopBody +{ +public: + template + HistogramInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +constexpr int HIST_SZ = std::numeric_limits::max() + 1; + +static inline void hist_invoke(int start, int end, const uchar* src_data, size_t src_step, int width, int* hist, std::mutex* m) +{ + int h[HIST_SZ] = {0}; + for (int i = start; i < end; i++) + { + const uchar* src = src_data + i * src_step; + int j; + for (j = 0; j + 3 < width; j += 4) + { + int t0 = src[j], t1 = src[j+1]; + h[t0]++; h[t1]++; + t0 = src[j+2]; t1 = src[j+3]; + h[t0]++; h[t1]++; + } + for (; j < width; j++) + { + h[src[j]]++; + } + } + + std::lock_guard lk(*m); + for (int i = 0; i < HIST_SZ; i++) + { + hist[i] += h[i]; + } +} + +static inline void lut_invoke(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, const uchar* lut) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e8m8(width - j); + auto src = __riscv_vle8_v_u8m8(src_data + i * src_step + j, vl); + auto dst = __riscv_vloxei8_v_u8m8(lut, src, vl); + __riscv_vse8(dst_data + i * dst_step + j, dst, vl); + } + } +} + +} // equalize_hist + +// the algorithm is copied from imgproc/src/histogram.cpp, +// in the function void cv::equalizeHist +int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + int hist[HIST_SZ] = {0}; + uchar lut[HIST_SZ]; + + std::mutex m; + cv::parallel_for_(Range(0, height), HistogramInvoker({hist_invoke}, src_data, src_step, width, reinterpret_cast(hist), &m), static_cast(width * height) / (1 << 15)); + + int i = 0; + while (!hist[i]) ++i; + + float scale = (HIST_SZ - 1.f)/(width * height - hist[i]); + int sum = 0; + for (lut[i++] = 0; i < HIST_SZ; i++) + { + sum += hist[i]; + lut[i] = std::min(std::max(static_cast(std::round(sum * scale)), 0), HIST_SZ - 1); + } + cv::parallel_for_(Range(0, height), HistogramInvoker({lut_invoke}, src_data, src_step, dst_data, dst_step, width, reinterpret_cast(lut)), static_cast(width * height) / (1 << 15)); + + return CV_HAL_ERROR_OK; +} + +// ############ calc_hist ############ + +namespace { + +constexpr int MAX_VLEN = 
1024; +constexpr int MAX_E8M1 = MAX_VLEN / 8; + +inline void cvt_32s32f(const int* ihist, float* fhist, int hist_size) { + int vl; + for (int i = 0; i < hist_size; i += vl) { + vl = __riscv_vsetvl_e32m8(hist_size - i); + auto iv = __riscv_vle32_v_i32m8(ihist + i, vl); + __riscv_vse32(fhist + i, __riscv_vfcvt_f(iv, vl), vl); + } +} + +inline void cvt32s32f_add32f(const int* ihist, float* fhist, int hist_size) { + int vl; + for (int i = 0; i < hist_size; i += vl) { + vl = __riscv_vsetvl_e32m8(hist_size - i); + auto iv = __riscv_vle32_v_i32m8(ihist + i, vl); + auto fv = __riscv_vle32_v_f32m8(fhist + i, vl); + auto s = __riscv_vfadd(__riscv_vfcvt_f(iv, vl), fv, vl); + __riscv_vse32(fhist + i, s, vl); + } +} + +} + +int calc_hist(const uchar* src_data, size_t src_step, int src_type, int src_width, int src_height, + float* hist_data, int hist_size, const float** ranges, bool uniform, bool accumulate) { + int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type); + + // [TODO] support non-uniform + // In case of CV_8U, it is already fast enough with lut + if ((depth != CV_16U && depth != CV_32F) || !uniform) { + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + std::vector buf_ihist(hist_size+1, 0); + int* ihist = buf_ihist.data(); + + double low = ranges[0][0], high = ranges[0][1]; + double t = hist_size / (high - low); + double a = t, b = -t * low; + double v0_lo = low, v0_hi = high; + + int sz = hist_size, d0 = cn, step0 = (int)(src_step / CV_ELEM_SIZE1(src_type)); + int buf_idx[MAX_E8M1]; + + if (depth == CV_16U) { + const ushort* p0 = (const ushort*)src_data; + if (d0 == 1) { + while (src_height--) { + int vl; + for (int x = 0; x < src_width; x += vl) { + vl = __riscv_vsetvl_e16m2(src_width - x); + + auto v = __riscv_vfcvt_f(__riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle16_v_u16m2(p0 + x, vl), vl), vl), vl); + + auto m0 = __riscv_vmflt(v, v0_lo, vl); + auto m1 = __riscv_vmfge(v, v0_hi, vl); + auto m = __riscv_vmor(m0, m1, vl); + + auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl); + auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl); + idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl); + idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl); + idx = __riscv_vmerge(idx, -1, m, vl); + __riscv_vse32(buf_idx, idx, vl); + + for (int i = 0; i < vl; i++) { + int _idx = buf_idx[i] + 1; + ihist[_idx]++; + } + } + p0 += step0; + } + } else { + while (src_height--) { + int vl; + for (int x = 0; x < src_width; x += vl) { + vl = __riscv_vsetvl_e16m2(src_width - x); + + auto v = __riscv_vfcvt_f(__riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vlse16_v_u16m2(p0 + x*d0, sizeof(ushort)*d0, vl), vl), vl), vl); + + auto m0 = __riscv_vmflt(v, v0_lo, vl); + auto m1 = __riscv_vmfge(v, v0_hi, vl); + auto m = __riscv_vmor(m0, m1, vl); + + auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl); + auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl); + idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl); + idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl); + idx = __riscv_vmerge(idx, -1, m, vl); + __riscv_vse32(buf_idx, idx, vl); + + for (int i = 0; i < vl; i++) { + int _idx = buf_idx[i] + 1; + ihist[_idx]++; + } + } + p0 += step0; + } + } + } else if (depth == CV_32F) { + const float* p0 = (const float*)src_data; + if (d0 == 1) { + while (src_height--) { + int vl; + for (int x = 0; x < src_width; x += vl) { + vl = __riscv_vsetvl_e32m4(src_width - x); + + auto v = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(p0 + x, vl), vl); + + auto m0 = 
__riscv_vmflt(v, v0_lo, vl); + auto m1 = __riscv_vmfge(v, v0_hi, vl); + auto m = __riscv_vmor(m0, m1, vl); + + auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl); + auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl); + idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl); + idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl); + idx = __riscv_vmerge(idx, -1, m, vl); + __riscv_vse32(buf_idx, idx, vl); + + for (int i = 0; i < vl; i++) { + int _idx = buf_idx[i] + 1; + ihist[_idx]++; + } + } + p0 += step0; + } + } else { + while (src_height--) { + int vl; + for (int x = 0; x < src_width; x += vl) { + vl = __riscv_vsetvl_e32m4(src_width - x); + + auto v = __riscv_vfwcvt_f(__riscv_vlse32_v_f32m4(p0 + x*d0, sizeof(float)*d0, vl), vl); + + auto m0 = __riscv_vmflt(v, v0_lo, vl); + auto m1 = __riscv_vmfge(v, v0_hi, vl); + auto m = __riscv_vmor(m0, m1, vl); + + auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl); + auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl); + idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl); + idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl); + idx = __riscv_vmerge(idx, -1, m, vl); + __riscv_vse32(buf_idx, idx, vl); + + for (int i = 0; i < vl; i++) { + int _idx = buf_idx[i] + 1; + ihist[_idx]++; + } + } + p0 += step0; + } + } + } + + if (accumulate) { + cvt32s32f_add32f(ihist+1, hist_data, hist_size); + } else { + std::memset(hist_data, 0, sizeof(float)*hist_size); + cvt_32s32f(ihist+1, hist_data, hist_size); + } + + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp b/hal/riscv-rvv/src/imgproc/integral.cpp similarity index 92% rename from hal/riscv-rvv/hal_rvv_1p0/integral.hpp rename to hal/riscv-rvv/src/imgproc/integral.cpp index a3ea0b5557..e0c7f44995 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/integral.hpp +++ b/hal/riscv-rvv/src/imgproc/integral.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
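calc_hist above bins a sample v with idx = floor(v * a + b), where a = hist_size / (high - low) and b = -a * low; the vector path gets the floor by subtracting 0.5 - 1e-6 before the round-to-nearest narrowing convert, clamps idx to [0, hist_size - 1], forces out-of-range samples to -1, and counts into a buffer offset by one so that slot 0 silently absorbs the rejects. A scalar model of that indexing for the uniform case (bin_counts is an illustrative name, not part of the patch):

#include <cmath>
#include <cstdio>
#include <vector>

// Out-of-range samples land in slot 0 of the (hist_size + 1)-sized buffer,
// valid bins in slots 1..hist_size; only 1..hist_size are copied out later.
static std::vector<int> bin_counts(const float* data, int n, int hist_size, float low, float high)
{
    std::vector<int> ihist(hist_size + 1, 0);
    const double a = hist_size / (double)(high - low), b = -a * low;
    for (int i = 0; i < n; i++)
    {
        int idx;
        if (data[i] < low || data[i] >= high)
            idx = -1;                               // out of range
        else
        {
            idx = (int)std::floor(data[i] * a + b); // uniform bin index
            idx = idx < 0 ? 0 : (idx > hist_size - 1 ? hist_size - 1 : idx);
        }
        ihist[idx + 1]++;                           // +1 shift: slot 0 absorbs rejects
    }
    return ihist;
}

int main()
{
    const float samples[] = {0.f, 0.4f, 1.9f, 2.f, -1.f};
    std::vector<int> h = bin_counts(samples, 5, 4, 0.f, 2.f); // 4 bins over [0, 2)
    for (int c : h) std::printf("%d ", c);                    // 2 2 0 0 1
    return 0;
}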
-#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED -#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_integral -#define cv_hal_integral cv::cv_hal_rvv::integral +namespace { template inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) { @@ -87,6 +84,8 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz return result; } +} // anonymous + /** @brief Calculate integral image @param depth Depth of source image @@ -119,12 +118,12 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz CV_32F | CV_64F | CV_64F CV_64F | CV_64F | CV_64F */ -inline int integral(int depth, int sdepth, int sqdepth, - const uchar* src_data, size_t src_step, - uchar* sum_data, size_t sum_step, - uchar* sqsum_data, size_t sqsum_step, - uchar* tilted_data, [[maybe_unused]] size_t tilted_step, - int width, int height, int cn) { +int integral(int depth, int sdepth, int sqdepth, + const uchar* src_data, size_t src_step, + uchar* sum_data, size_t sum_step, + uchar* sqsum_data, size_t sqsum_step, + uchar* tilted_data, [[maybe_unused]] size_t tilted_step, + int width, int height, int cn) { // tilted sum and cn == 3 cases are not supported if (tilted_data || cn == 3) { return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -168,6 +167,6 @@ inline int integral(int depth, int sdepth, int sqdepth, return result; } -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/median_blur.cpp b/hal/riscv-rvv/src/imgproc/median_blur.cpp new file mode 100644 index 0000000000..d86b2d92e3 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/median_blur.cpp @@ -0,0 +1,575 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
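Editor's note: the integral() wrapper above reduces to the classic prefix-sum recurrence sum(i, j) = src(i, j) + sum(i-1, j) + sum(i, j-1) - sum(i-1, j-1). A minimal scalar sketch follows (illustrative only, not part of the patch; integral_ref is a hypothetical name; single channel is assumed, sum_step is counted in elements, and the output follows the usual OpenCV convention of one extra zero row and column).

#include <cstring>

// Scalar sketch of the integral-image recurrence that the RVV code vectorizes.
// sum must point to a (height + 1) x sum_step buffer with sum_step >= width + 1.
static void integral_ref(const unsigned char* src, int src_step,
                         double* sum, int sum_step, int width, int height)
{
    std::memset(sum, 0, sizeof(double) * (width + 1));   // first output row is all zeros
    for (int i = 0; i < height; i++)
    {
        const unsigned char* s = src + i * src_step;
        const double* prev = sum + i * sum_step;          // previous output row
        double* cur = sum + (i + 1) * sum_step;           // current output row
        cur[0] = 0;                                        // first output column is zero
        double row_sum = 0;                                // running sum of the current source row
        for (int j = 0; j < width; j++)
        {
            row_sum += s[j];
            cur[j + 1] = prev[j + 1] + row_sum;            // column sum above + row prefix sum
        }
    }
}

Keeping a running row sum means each output element needs only one addition on top of the value directly above it, which is also the dependency pattern the vectorized implementation has to respect.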
+ +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +// the algorithm is copied from imgproc/src/median_blur.simd.cpp +// in the function template static void medianBlur_SortNet +template +static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + using T = typename helper::ElemType; + using VT = typename helper::VecType; + + for (int i = start; i < end; i++) + { + const T* row0 = reinterpret_cast(src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step); + const T* row1 = reinterpret_cast(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step); + const T* row2 = reinterpret_cast(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step); + const T* row3 = reinterpret_cast(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step); + const T* row4 = reinterpret_cast(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step); + int vl; + auto vop = [&vl](VT& a, VT& b) { + auto t = a; + a = helper::vmin(a, b, vl); + b = helper::vmax(t, b, vl); + }; + + for (int j = 0; j < width; j += vl) + { + vl = helper::setvl(width - j); + if (ksize == 3) + { + VT p0, p1, p2; + VT p3, p4, p5; + VT p6, p7, p8; + if (j != 0) + { + p0 = helper::vload(row0 + j - 1, vl); + p3 = helper::vload(row1 + j - 1, vl); + p6 = helper::vload(row2 + j - 1, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p6 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + } + p1 = helper::vslide1down(p0, row0[j + vl - 1], vl); + p4 = helper::vslide1down(p3, row1[j + vl - 1], vl); + p7 = helper::vslide1down(p6, row2[j + vl - 1], vl); + p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl); + p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl); + + vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1); + vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5); + vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7); + vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); + vop(p4, p2); vop(p6, p4); vop(p4, p2); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p4, vl); + } + else + { + VT p0, p1, p2, p3, p4; + VT p5, p6, p7, p8, p9; + VT p10, p11, p12, p13, p14; + VT p15, p16, p17, p18, p19; + VT p20, p21, p22, p23, p24; + if (j >= 2) + { + p0 = helper::vload(row0 + j - 2, vl); + p5 = helper::vload(row1 + j - 2, vl); + p10 = helper::vload(row2 + j - 2, vl); + p15 = helper::vload(row3 + j - 2, vl); + p20 = helper::vload(row4 + j - 2, vl); + } + else + { + p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl); + p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl); + p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl); + p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl); + p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl); + if (j == 0) + { + p0 = helper::vslide1up(p0, row0[0], vl); + p5 = helper::vslide1up(p5, row1[0], vl); + p10 = helper::vslide1up(p10, row2[0], vl); + p15 = helper::vslide1up(p15, row3[0], vl); + p20 = helper::vslide1up(p20, row4[0], vl); + } + } + p1 = helper::vslide1down(p0, row0[j + vl - 2], vl); + p6 = helper::vslide1down(p5, row1[j + vl - 2], vl); + p11 = helper::vslide1down(p10, 
row2[j + vl - 2], vl); + p16 = helper::vslide1down(p15, row3[j + vl - 2], vl); + p21 = helper::vslide1down(p20, row4[j + vl - 2], vl); + p2 = helper::vslide1down(p1, row0[j + vl - 1], vl); + p7 = helper::vslide1down(p6, row1[j + vl - 1], vl); + p12 = helper::vslide1down(p11, row2[j + vl - 1], vl); + p17 = helper::vslide1down(p16, row3[j + vl - 1], vl); + p22 = helper::vslide1down(p21, row4[j + vl - 1], vl); + p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl); + p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl); + p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl); + p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl); + p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl); + p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl); + p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl); + p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl); + p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl); + p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl); + + vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4); + vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4); + vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8); + vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11); + vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6); + vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8); + vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5); + vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8); + vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17); + vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15); + vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19); + vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24); + vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22); + vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18); + vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16); + vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19); + vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16); + vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12); + vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16); + vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10); + vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17); + vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); + vop(p7, p11); vop(p11, p13); vop(p11, p12); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, p12, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height) +{ + for (int i = start; i < end; i++) + { + const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step; + const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step; + const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step; + const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step; + const uchar* row4 = src_data + 
std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step; + int vl; + for (int j = 0; j < width; j += vl) + { + if (ksize == 3) + { + vl = __riscv_vsetvl_e8m1(width - j); + vuint8m1_t p00, p01, p02; + vuint8m1_t p03, p04, p05; + vuint8m1_t p06, p07, p08; + vuint8m1_t p10, p11, p12; + vuint8m1_t p13, p14, p15; + vuint8m1_t p16, p17, p18; + vuint8m1_t p20, p21, p22; + vuint8m1_t p23, p24, p25; + vuint8m1_t p26, p27, p28; + vuint8m1_t p30, p31, p32; + vuint8m1_t p33, p34, p35; + vuint8m1_t p36, p37, p38; + auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) { + auto src = __riscv_vlseg4e8_v_u8m1x4(row, vl); + p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0); + p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1); + p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2); + p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3); + }; + if (j != 0) + { + loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33); + loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p03, p13, p23, p33); + loadsrc(row2, p06, p16, p26, p36); + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p03 = __riscv_vslide1up(p03, row1[0], vl); + p13 = __riscv_vslide1up(p13, row1[1], vl); + p23 = __riscv_vslide1up(p23, row1[2], vl); + p33 = __riscv_vslide1up(p33, row1[3], vl); + p06 = __riscv_vslide1up(p06, row2[0], vl); + p16 = __riscv_vslide1up(p16, row2[1], vl); + p26 = __riscv_vslide1up(p26, row2[2], vl); + p36 = __riscv_vslide1up(p36, row2[3], vl); + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl); + p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + + auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, 
b, vl); + }; + vuint8m1x4_t dst{}; + vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01); + vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05); + vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07); + vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07); + vop(p04, p02); vop(p06, p04); vop(p04, p02); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04); + vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11); + vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15); + vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17); + vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17); + vop(p14, p12); vop(p16, p14); vop(p14, p12); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14); + vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21); + vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25); + vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27); + vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27); + vop(p24, p22); vop(p26, p24); vop(p24, p22); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24); + vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31); + vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35); + vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37); + vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37); + vop(p34, p32); vop(p36, p34); vop(p34, p32); + dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + vl = __riscv_vsetvl_e8m2(width - j); + vuint8m2_t p00, p01, p02, p03, p04; + vuint8m2_t p05, p06, p07, p08, p09; + vuint8m2_t p010, p011, p012, p013, p014; + vuint8m2_t p015, p016, p017, p018, p019; + vuint8m2_t p020, p021, p022, p023, p024; + vuint8m2_t p10, p11, p12, p13, p14; + vuint8m2_t p15, p16, p17, p18, p19; + vuint8m2_t p110, p111, p112, p113, p114; + vuint8m2_t p115, p116, p117, p118, p119; + vuint8m2_t p120, p121, p122, p123, p124; + vuint8m2_t p20, p21, p22, p23, p24; + vuint8m2_t p25, p26, p27, p28, p29; + vuint8m2_t p210, p211, p212, p213, p214; + vuint8m2_t p215, p216, p217, p218, p219; + vuint8m2_t p220, p221, p222, p223, p224; + vuint8m2_t p30, p31, p32, p33, p34; + vuint8m2_t p35, p36, p37, p38, p39; + vuint8m2_t p310, p311, p312, p313, p314; + vuint8m2_t p315, p316, p317, p318, p319; + vuint8m2_t p320, p321, p322, p323, p324; + auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) { + auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl); + p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0); + p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1); + p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2); + p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3); + }; + if (j >= 2) + { + loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30); + loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35); + loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310); + loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315); + loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320); + } + else + { + loadsrc(row0, p00, p10, p20, p30); + loadsrc(row1, p05, p15, p25, p35); + loadsrc(row2, p010, p110, p210, p310); + loadsrc(row3, p015, p115, p215, p315); + loadsrc(row4, p020, p120, p220, p320); + auto slideup = [&] { + p00 = __riscv_vslide1up(p00, row0[0], vl); + p10 = __riscv_vslide1up(p10, row0[1], vl); + p20 = __riscv_vslide1up(p20, row0[2], vl); + p30 = __riscv_vslide1up(p30, row0[3], vl); + p05 = __riscv_vslide1up(p05, row1[0], vl); + p15 = __riscv_vslide1up(p15, row1[1], vl); + p25 = __riscv_vslide1up(p25, row1[2], vl); + p35 = __riscv_vslide1up(p35, row1[3], 
vl); + p010 = __riscv_vslide1up(p010, row2[0], vl); + p110 = __riscv_vslide1up(p110, row2[1], vl); + p210 = __riscv_vslide1up(p210, row2[2], vl); + p310 = __riscv_vslide1up(p310, row2[3], vl); + p015 = __riscv_vslide1up(p015, row3[0], vl); + p115 = __riscv_vslide1up(p115, row3[1], vl); + p215 = __riscv_vslide1up(p215, row3[2], vl); + p315 = __riscv_vslide1up(p315, row3[3], vl); + p020 = __riscv_vslide1up(p020, row4[0], vl); + p120 = __riscv_vslide1up(p120, row4[1], vl); + p220 = __riscv_vslide1up(p220, row4[2], vl); + p320 = __riscv_vslide1up(p320, row4[3], vl); + }; + slideup(); + if (j == 0) + { + slideup(); + } + } + p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl); + p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl); + p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl); + p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl); + p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl); + p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl); + p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl); + p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl); + p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl); + p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl); + p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl); + p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl); + p016 = __riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl); + p116 = __riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl); + p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl); + p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl); + p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl); + p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl); + p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl); + p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl); + p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl); + p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl); + p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl); + p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl); + p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl); + p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl); + p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl); + p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl); + p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl); + p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl); + p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl); + p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl); + p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl); + p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl); + p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl); + p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl); + p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl); + p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl); + p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl); + p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl); + p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl); + p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + vl) * 4 + 1], vl); + p23 = __riscv_vslide1down(p22, row0[std::min(width - 1, j + vl) * 4 + 2], vl); + p33 = 
__riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl); + p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl); + p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl); + p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl); + p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl); + p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl); + p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl); + p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl); + p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl); + p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl); + p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl); + p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl); + p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl); + p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl); + p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl); + p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl); + p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl); + p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl); + p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl); + p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p29 = __riscv_vslide1down(p28, row1[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl); + p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl); + p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl); + p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl); + p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl); + p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl); + + auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) { + auto t = a; + a = __riscv_vminu(a, b, vl); + b = __riscv_vmaxu(t, b, vl); + }; + vuint8m2x4_t dst{}; + vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04); + vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04); + vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08); + vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, 
p011); + vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06); + vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08); + vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05); + vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08); + vop(p09, p010); vop(p013, p014); vop(p012, p013); vop(p013, p014); vop(p016, p017); + vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015); + vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019); + vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024); + vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022); + vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018); + vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016); + vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019); + vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016); + vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012); + vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016); + vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010); + vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017); + vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019); + vop(p07, p011); vop(p011, p013); vop(p011, p012); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012); + vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14); + vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14); + vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18); + vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111); + vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16); + vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18); + vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15); + vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18); + vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117); + vop(p115, p116); vop(p116, p117); vop(p112, p115); vop(p114, p117); vop(p114, p115); + vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119); + vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124); + vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122); + vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118); + vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116); + vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119); + vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116); + vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112); + vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116); + vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110); + vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117); + vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119); + vop(p17, p111); vop(p111, p113); vop(p111, p112); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, 
p112); + vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24); + vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24); + vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28); + vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211); + vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26); + vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28); + vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25); + vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28); + vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217); + vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, p217); vop(p214, p215); + vop(p213, p216); vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219); + vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224); + vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222); + vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218); + vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216); + vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219); + vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216); + vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212); + vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216); + vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210); + vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217); + vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219); + vop(p27, p211); vop(p211, p213); vop(p211, p212); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212); + vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34); + vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34); + vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38); + vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311); + vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36); + vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38); + vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35); + vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38); + vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317); + vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315); + vop(p313, p316); vop(p313, p314); vop(p315, p316); vop(p319, p320); vop(p318, p319); + vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324); + vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322); + vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318); + vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316); + vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319); + vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316); + vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312); + vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316); + vop(p32, p314); vop(p310, p322); 
vop(p310, p314); vop(p36, p318); vop(p36, p310); + vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317); + vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319); + vop(p37, p311); vop(p311, p313); vop(p311, p312); + dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize) +{ + const int type = CV_MAKETYPE(depth, cn); + if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((ksize != 3 && ksize != 5) || src_data == dst_data) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch (ksize*100 + type) + { + case 300 + CV_8UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16UC1: + return common::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_16SC1: + return common::invoke(height, {medianBlurC1<3, RVV_I16M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 300 + CV_32FC1: + return common::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16UC1: + return common::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_16SC1: + return common::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_32FC1: + return common::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height); + + case 300 + CV_8UC4: + return common::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height); + case 500 + CV_8UC4: + return common::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height); + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp b/hal/riscv-rvv/src/imgproc/moments.cpp similarity index 94% rename from hal/riscv-rvv/hal_rvv_1p0/moments.hpp rename to hal/riscv-rvv/src/imgproc/moments.cpp index f0db8b3a17..c29f1edfd0 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/moments.hpp +++ b/hal/riscv-rvv/src/imgproc/moments.cpp @@ -4,16 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. 
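Editor's note: the medianBlurC1/medianBlurC4 kernels above compute the median with a fixed compare-exchange network rather than sorting. The scalar sketch below runs exactly the 3x3 sequence used in the code, lane-by-lane the vectorized version does the same thing with vminu/vmaxu; after the last exchange the median of the nine inputs sits in p[4] (p12 for the 5x5 network). Illustrative only, not part of the patch; median9_ref is a hypothetical name.

#include <algorithm>

// Median of nine values via the compare-exchange network used by medianBlur (ksize == 3).
static unsigned char median9_ref(unsigned char p[9])
{
    auto op = [](unsigned char& a, unsigned char& b) {
        unsigned char t = a;
        a = std::min(a, b);        // smaller value goes to a
        b = std::max(t, b);        // larger value goes to b
    };
    op(p[1], p[2]); op(p[4], p[5]); op(p[7], p[8]); op(p[0], p[1]);
    op(p[3], p[4]); op(p[6], p[7]); op(p[1], p[2]); op(p[4], p[5]);
    op(p[7], p[8]); op(p[0], p[3]); op(p[5], p[8]); op(p[4], p[7]);
    op(p[3], p[6]); op(p[1], p[4]); op(p[2], p[5]); op(p[4], p[7]);
    op(p[4], p[2]); op(p[6], p[4]); op(p[4], p[2]);
    return p[4];                   // median of the nine inputs
}

Because the exchange sequence is data-independent, it maps directly onto masked-free vector min/max instructions, which is why the RVV code can process a whole row segment of 3x3 (or 5x5) neighbourhoods per iteration.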
-#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED -#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace imageMoments { -#undef cv_hal_imageMoments -#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments +namespace { class MomentsInvoker : public ParallelLoopBody { @@ -152,9 +149,11 @@ static inline int imageMoments(int start, int end, const uchar* src_data, size_t return CV_HAL_ERROR_OK; } +} // anonymous + // the algorithm is copied from imgproc/src/moments.cpp, // in the function cv::Moments cv::moments -inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) +int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10]) { if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -184,8 +183,7 @@ inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, in return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::imageMoments -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/morph.cpp b/hal/riscv-rvv/src/imgproc/morph.cpp new file mode 100644 index 0000000000..e5d79b598b --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/morph.cpp @@ -0,0 +1,331 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct Morph2D +{ + int operation; + int src_type; + int dst_type; + int kernel_type; + uchar* kernel_data; + size_t kernel_step; + int kernel_width; + int kernel_height; + int anchor_x; + int anchor_y; + int borderType; + const uchar* borderValue; +}; + +template struct rvv; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::max(); } + static inline uchar mop(uchar a, uchar b) { return a < b ? a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vminu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vminu(a, b, c); } +}; +template<> struct rvv +{ + static inline uchar init() { return std::numeric_limits::min(); } + static inline uchar mop(uchar a, uchar b) { return a > b ? 
a : b; } + static inline vuint8m4_t vop(vuint8m4_t a, vuint8m4_t b, size_t c) { return __riscv_vmaxu(a, b, c); } + static inline vuint8m4_t vop(vuint8m4_t a, uchar b, size_t c) { return __riscv_vmaxu(a, b, c); } +}; + +// the algorithm is copied from 3rdparty/carotene/src/morph.cpp, +// in the function template void morph3x3 +template +static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + bool kernel[9]; + for (int i = 0; i < 9; i++) + { + kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0; + } + + constexpr int noval = std::numeric_limits::max(); + auto access = [&](int x, int y) { + int pi, pj; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + pj = pj < 0 ? noval : pj; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + pj = pj < 0 ? noval : pj - offset_x; + } + return std::make_pair(pi, pj); + }; + + auto process = [&](int x, int y) { + if (data->src_type == CV_8UC1) + { + uchar val = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val = rvv::mop(val, src_data[p.first * src_step + p.second]); + } + else + { + val = rvv::mop(val, data->borderValue[0]); + } + } + } + dst_data[x * width + y] = val; + } + else + { + uchar val0, val1, val2, val3; + val0 = val1 = val2 = val3 = rvv::init(); + for (int i = 0; i < 9; i++) + { + if (kernel[i]) + { + auto p = access(x + i / 3, y + i % 3); + if (p.first != noval && p.second != noval) + { + val0 = rvv::mop(val0, src_data[p.first * src_step + p.second * 4 ]); + val1 = rvv::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]); + val2 = rvv::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]); + val3 = rvv::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]); + } + else + { + val0 = rvv::mop(val0, data->borderValue[0]); + val1 = rvv::mop(val1, data->borderValue[1]); + val2 = rvv::mop(val2, data->borderValue[2]); + val3 = rvv::mop(val3, data->borderValue[3]); + } + } + } + dst_data[(x * width + y) * 4 ] = val0; + dst_data[(x * width + y) * 4 + 1] = val1; + dst_data[(x * width + y) * 4 + 2] = val2; + dst_data[(x * width + y) * 4 + 3] = val3; + } + }; + + const int left = data->anchor_x, right = width - (2 - data->anchor_x); + for (int i = start; i < end; i++) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + const uchar* row0 = access(i , 0).first == noval ? nullptr : src_data + access(i , 0).first * src_step; + const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step; + const uchar* row2 = access(i + 2, 0).first == noval ? 
nullptr : src_data + access(i + 2, 0).first * src_step; + if (data->src_type == CV_8UC1) + { + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = rvv::vop(m0, data->borderValue[0], vl); + return; + } + + const uchar* extra = row + j - data->anchor_x; + auto v0 = __riscv_vle8_v_u8m4(extra, vl); + + if (k0) m0 = rvv::vop(m0, v0, vl); + v0 = __riscv_vslide1down(v0, extra[vl], vl); + if (k1) m0 = rvv::vop(m0, v0, vl); + if (!k2) return; + v0 = __riscv_vslide1down(v0, extra[vl + 1], vl); + m0 = rvv::vop(m0, v0, vl); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + __riscv_vse8(dst_data + i * width + j, m0, vl); + } + } + else + { + int vl, vl0, vl1; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m4(right - j); + vl0 = std::min(vl, (int)__riscv_vlenb() * 2); + vl1 = vl - vl0; + auto m0 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m1 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m2 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + auto m3 = __riscv_vmv_v_x_u8m4(rvv::init(), vl); + + auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) { + if (k0) a = rvv::vop(a, b, vl); + b = __riscv_vslide1down(b, r1, vl); + if (k1) a = rvv::vop(a, b, vl); + if (!k2) return a; + b = __riscv_vslide1down(b, r2, vl); + return rvv::vop(a, b, vl); + }; + auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) { + if (!row) + { + m0 = rvv::vop(m0, data->borderValue[0], vl); + m1 = rvv::vop(m1, data->borderValue[1], vl); + m2 = rvv::vop(m2, data->borderValue[2], vl); + m3 = rvv::vop(m3, data->borderValue[3], vl); + return; + } + + vuint8m4_t v0{}, v1{}, v2{}, v3{}; + const uchar* extra = row + (j - data->anchor_x) * 4; + auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1); + v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0)); + v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1)); + v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2)); + v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3)); + + extra += vl * 4; + m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]); + m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]); + m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]); + m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]); + }; + + loadsrc(row0, kernel[0], kernel[1], kernel[2]); + loadsrc(row1, kernel[3], kernel[4], kernel[5]); + loadsrc(row2, kernel[6], kernel[7], kernel[8]); + vuint8m2x4_t val{}; + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0)); + __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0); + val = __riscv_vset_v_u8m2_u8m2x4(val, 0, 
__riscv_vget_v_u8m4_u8m2(m0, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1)); + val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1)); + __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/) +{ + if (kernel_type != CV_8UC1 || src_type != dst_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_8UC4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (kernel_width != kernel_height || kernel_width != 3) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (iterations != 1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + uchar* borderV; + if (src_type == CV_8UC1) + { + borderV = new uchar{static_cast(borderValue[0])}; + if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX) + borderV[0] = 0; + } + else + { + borderV = new uchar[4]{static_cast(borderValue[0]), static_cast(borderValue[1]), static_cast(borderValue[2]), static_cast(borderValue[3])}; + if (operation == CV_HAL_MORPH_DILATE) + { + if (borderValue[0] == DBL_MAX) + borderV[0] = 0; + if (borderValue[1] == DBL_MAX) + borderV[1] = 0; + if (borderValue[2] == DBL_MAX) + borderV[2] = 0; + if (borderValue[3] == DBL_MAX) + borderV[3] = 0; + } + } + + anchor_x = anchor_x < 0 ? kernel_width / 2 : anchor_x; + anchor_y = anchor_y < 0 ? kernel_height / 2 : anchor_y; + *context = reinterpret_cast(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV}); + return CV_HAL_ERROR_OK; +} + +int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/) +{ + Morph2D* data = reinterpret_cast(context); + int cn = data->src_type == CV_8UC1 ? 
1 : 4; + std::vector dst(width * height * cn); + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->operation) + { + case CV_HAL_MORPH_ERODE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + case CV_HAL_MORPH_DILATE: + res = common::invoke(height, {morph}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y); + break; + } + + for (int i = 0; i < height; i++) + memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn); + return res; +} + +int morphFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context)->borderValue; + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp b/hal/riscv-rvv/src/imgproc/pyramids.cpp similarity index 97% rename from hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp rename to hal/riscv-rvv/src/imgproc/pyramids.cpp index a349d341c5..66bf4c1b4d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/pyramids.hpp +++ b/hal/riscv-rvv/src/imgproc/pyramids.cpp @@ -4,18 +4,13 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED -#define OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED +#include "rvv_hal.hpp" -#include -#include "hal_rvv_1p0/types.hpp" +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { namespace pyramids { +#if CV_HAL_RVV_1P0_ENABLED -#undef cv_hal_pyrdown -#define cv_hal_pyrdown cv::cv_hal_rvv::pyramids::pyrDown -#undef cv_hal_pyrup -#define cv_hal_pyrup cv::cv_hal_rvv::pyramids::pyrUp +namespace { template struct rvv; @@ -562,7 +557,9 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_OK; } -inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +} // anonymous + +int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type == BORDER_CONSTANT || (depth == CV_32F && cn == 1)) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -580,7 +577,7 @@ inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) +int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type) { if (border_type != BORDER_DEFAULT) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -598,6 +595,6 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -}}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp b/hal/riscv-rvv/src/imgproc/resize.cpp similarity index 99% rename from hal/riscv-rvv/hal_rvv_1p0/resize.hpp rename to hal/riscv-rvv/src/imgproc/resize.cpp index d18db5f058..1ce5e16bb3 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/resize.hpp +++ b/hal/riscv-rvv/src/imgproc/resize.cpp @@ -4,17 +4,15 
@@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED -#define OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace resize { -#undef cv_hal_resize -#define cv_hal_resize cv::cv_hal_rvv::resize::resize +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ResizeInvoker : public ParallelLoopBody { @@ -986,7 +984,9 @@ static inline int resizeArea(int src_type, const uchar *src_data, size_t src_ste return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) +} // anonymous + +int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation) { inv_scale_x = 1 / inv_scale_x; inv_scale_y = 1 / inv_scale_y; @@ -999,8 +999,7 @@ inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_ return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::resize -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/src/imgproc/sep_filter.cpp b/hal/riscv-rvv/src/imgproc/sep_filter.cpp new file mode 100644 index 0000000000..54267683e5 --- /dev/null +++ b/hal/riscv-rvv/src/imgproc/sep_filter.cpp @@ -0,0 +1,259 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#include "rvv_hal.hpp" +#include "common.hpp" + +namespace cv { namespace rvv_hal { namespace imgproc { + +#if CV_HAL_RVV_1P0_ENABLED + +namespace { + +struct sepFilter2D +{ + int src_type; + int dst_type; + int kernel_type; + const uchar* kernelx_data; + int kernelx_length; + const uchar* kernely_data; + int kernely_length; + int anchor_x; + int anchor_y; + double delta; + int borderType; +}; + +// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp, +// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic +template +static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + constexpr int noval = std::numeric_limits::max(); + auto accessX = [&](int x) { + int pi; + if (data->borderType & BORDER_ISOLATED) + { + pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED); + pi = pi < 0 ? noval : pi; + } + else + { + pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType); + pi = pi < 0 ? noval : pi - offset_y; + } + return pi; + }; + auto accessY = [&](int y) { + int pj; + if (data->borderType & BORDER_ISOLATED) + { + pj = common::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED); + pj = pj < 0 ? noval : pj; + } + else + { + pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType); + pj = pj < 0 ? 
noval : pj - offset_x; + } + return pj; + }; + auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; }; + + const float* kx = reinterpret_cast(data->kernelx_data); + const float* ky = reinterpret_cast(data->kernely_data); + std::vector res(width * ksize); + auto process = [&](int x, int y) { + float sum = 0; + for (int i = 0; i < ksize; i++) + { + int p = accessY(y + i); + if (p != noval) + { + sum += kx[i] * reinterpret_cast(src_data + x * src_step)[p]; + } + } + res[p2idx(x, y)] = sum; + }; + + const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x); + for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++) + { + if (i + offset_y >= 0 && i + offset_y < full_height) + { + if (left >= right) + { + for (int j = 0; j < width; j++) + process(i, j); + } + else + { + for (int j = 0; j < left; j++) + process(i, j); + for (int j = right; j < width; j++) + process(i, j); + + int vl; + for (int j = left; j < right; j += vl) + { + vl = __riscv_vsetvl_e8m2(right - j); + const T* extra = reinterpret_cast(src_data + i * src_step) + j - data->anchor_x; + vfloat32m8_t src; + if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast(extra), vl), vl), vl); + } + else if (std::is_same::value) + { + src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast(extra), vl), vl); + } + else + { + src = __riscv_vle32_v_f32m8(reinterpret_cast(extra), vl); + } + + extra += vl; + auto sum = __riscv_vfmul(src, kx[0], vl); + src = __riscv_vfslide1down(src, extra[0], vl); + sum = __riscv_vfmacc(sum, kx[1], src, vl); + src = __riscv_vfslide1down(src, extra[1], vl); + sum = __riscv_vfmacc(sum, kx[2], src, vl); + if (ksize == 5) + { + src = __riscv_vfslide1down(src, extra[2], vl); + sum = __riscv_vfmacc(sum, kx[3], src, vl); + src = __riscv_vfslide1down(src, extra[3], vl); + sum = __riscv_vfmacc(sum, kx[4], src, vl); + } + __riscv_vse32(res.data() + p2idx(i, j), sum, vl); + } + } + } + + int cur = i - (ksize - 1 - data->anchor_y); + if (cur >= start) + { + const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0); + const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0); + const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0); + const float* row3 = nullptr, *row4 = nullptr; + if (ksize == 5) + { + row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0); + row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0); + } + + int vl; + for (int j = 0; j < width; j += vl) + { + vl = __riscv_vsetvl_e32m4(width - j); + auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl); + + if (ksize == 5) + { + auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + auto v4 = row4 ? 
__riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl); + sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl); + } + + if (data->dst_type == CV_16SC1) + { + __riscv_vse16(reinterpret_cast(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl); + } + else + { + __riscv_vse32(reinterpret_cast(dst_data + cur * dst_step) + j, sum, vl); + } + } + } + } + + return CV_HAL_ERROR_OK; +} + +} // anonymous + +int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType) +{ + if (kernel_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (src_type != CV_8UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (dst_type != CV_16SC1 && dst_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((kernelx_length != 3 && kernelx_length != 5) || kernelx_length != kernely_length) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + anchor_x = anchor_x < 0 ? kernelx_length / 2 : anchor_x; + anchor_y = anchor_y < 0 ? kernely_length / 2 : anchor_y; + *context = reinterpret_cast(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED}); + return CV_HAL_ERROR_OK; +} + +int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y) +{ + sepFilter2D* data = reinterpret_cast(context); + + uchar* _dst_data = dst_data; + size_t _dst_step = dst_step; + const size_t size = CV_ELEM_SIZE(data->dst_type); + std::vector dst; + if (src_data == _dst_data) + { + dst = std::vector(width * height * size); + dst_data = dst.data(); + dst_step = width * size; + } + + int res = CV_HAL_ERROR_NOT_IMPLEMENTED; + switch (data->kernelx_length*100 + data->src_type) + { + case 300 + CV_8UC1: + res = common::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_8UC1: + res = common::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_16SC1: + res = common::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_16SC1: + res = common::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 300 + CV_32FC1: + res = common::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + case 500 + CV_32FC1: + res = common::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + break; + } + if (res == CV_HAL_ERROR_NOT_IMPLEMENTED) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + if (src_data == _dst_data) + { + for (int i = 0; i < height; i++) + memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step); + } + + return res; +} + +int 
sepFilterFree(cvhalFilter2D* context) +{ + delete reinterpret_cast(context); + return CV_HAL_ERROR_OK; +} + +#endif // CV_HAL_RVV_1P0_ENABLED + +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp b/hal/riscv-rvv/src/imgproc/threshold.cpp similarity index 86% rename from hal/riscv-rvv/hal_rvv_1p0/thresh.hpp rename to hal/riscv-rvv/src/imgproc/threshold.cpp index 738e3d5012..8d76b5626d 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/thresh.hpp +++ b/hal/riscv-rvv/src/imgproc/threshold.cpp @@ -4,18 +4,15 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_THRESH_HPP_INCLUDED -#define OPENCV_HAL_RVV_THRESH_HPP_INCLUDED - -#include +#include "rvv_hal.hpp" +#include "common.hpp" #include -namespace cv { namespace cv_hal_rvv { +namespace cv { namespace rvv_hal { namespace imgproc { -namespace threshold { -// disabled since UI is fast enough, only called in threshold_otsu -// #undef cv_hal_threshold -// #define cv_hal_threshold cv::cv_hal_rvv::threshold::threshold +#if CV_HAL_RVV_1P0_ENABLED + +namespace { class ThresholdInvoker : public ParallelLoopBody { @@ -182,16 +179,6 @@ static inline int threshold_range(int start, int end, const uchar* src_data, siz return CV_HAL_ERROR_NOT_IMPLEMENTED; } -inline int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) -{ - return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold - -namespace threshold_otsu { -#undef cv_hal_threshold_otsu -#define cv_hal_threshold_otsu cv::cv_hal_rvv::threshold_otsu::threshold_otsu - static inline int otsu(int start, int end, const uchar* src_data, size_t src_step, int width, std::atomic* cnt, int N, int* h) { const int c = cnt->fetch_add(1) % cv::getNumThreads(); @@ -205,69 +192,6 @@ static inline int otsu(int start, int end, const uchar* src_data, size_t src_ste return CV_HAL_ERROR_OK; } -// the algorithm is copied from imgproc/src/thresh.cpp, -// in the function template static double getThreshVal_Otsu -inline int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) -{ - if (depth != CV_8UC1 || width * height < (1 << 15)) - return CV_HAL_ERROR_NOT_IMPLEMENTED; - - const int N = std::numeric_limits::max() + 1; - const int nums = cv::getNumThreads(); - std::vector _h(N * nums, 0); - int* h = _h.data(); - - std::atomic cnt(0); - cv::parallel_for_(Range(0, height), threshold::ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); - for (int i = N; i < nums * N; i++) - { - h[i % N] += h[i]; - } - - double mu = 0, scale = 1. / (width*height); - for (int i = 0; i < N; i++) - { - mu += i*(double)h[i]; - } - - mu *= scale; - double mu1 = 0, q1 = 0; - double max_sigma = 0, max_val = 0; - - for (int i = 0; i < N; i++) - { - double p_i, q2, mu2, sigma; - - p_i = h[i]*scale; - mu1 *= q1; - q1 += p_i; - q2 = 1. - q1; - - if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. 
- FLT_EPSILON) - continue; - - mu1 = (mu1 + i*p_i)/q1; - mu2 = (mu - q1*mu1)/q2; - sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); - if (sigma > max_sigma) - { - max_sigma = sigma; - max_val = i; - } - } - - *thresh = max_val; - if (dst_data == nullptr) - return CV_HAL_ERROR_OK; - - return threshold::invoke(width, height, {threshold::threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); -} -} // cv::cv_hal_rvv::threshold_otsu - -namespace adaptiveThreshold { -#undef cv_hal_adaptiveThreshold -#define cv_hal_adaptiveThreshold cv::cv_hal_rvv::adaptiveThreshold::adaptiveThreshold - // the algorithm is copied from imgproc/src/thresh.cpp, // in the function void cv::adaptiveThreshold template @@ -444,7 +368,72 @@ static inline int adaptiveThreshold(int start, int end, const uchar* src_data, s return CV_HAL_ERROR_OK; } -inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) +} // anonymous + +int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType) +{ + return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType); +} + +// the algorithm is copied from imgproc/src/thresh.cpp, +// in the function template static double getThreshVal_Otsu +int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh) +{ + if (depth != CV_8UC1 || width * height < (1 << 15)) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const int N = std::numeric_limits::max() + 1; + const int nums = cv::getNumThreads(); + std::vector _h(N * nums, 0); + int* h = _h.data(); + + std::atomic cnt(0); + cv::parallel_for_(Range(0, height), ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums); + for (int i = N; i < nums * N; i++) + { + h[i % N] += h[i]; + } + + double mu = 0, scale = 1. / (width*height); + for (int i = 0; i < N; i++) + { + mu += i*(double)h[i]; + } + + mu *= scale; + double mu1 = 0, q1 = 0; + double max_sigma = 0, max_val = 0; + + for (int i = 0; i < N; i++) + { + double p_i, q2, mu2, sigma; + + p_i = h[i]*scale; + mu1 *= q1; + q1 += p_i; + q2 = 1. - q1; + + if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. 
- FLT_EPSILON) + continue; + + mu1 = (mu1 + i*p_i)/q1; + mu2 = (mu - q1*mu1)/q2; + sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2); + if (sigma > max_sigma) + { + max_sigma = sigma; + max_val = i; + } + } + + *thresh = max_val; + if (dst_data == nullptr) + return CV_HAL_ERROR_OK; + + return invoke(width, height, {threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType); +} + +int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C) { if (thresholdType != CV_HAL_THRESH_BINARY && thresholdType != CV_HAL_THRESH_BINARY_INV) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -456,27 +445,26 @@ inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_ switch (blockSize*100 + adaptiveMethod*10 + thresholdType) { case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, 
height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV: - return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); + return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::adaptiveThreshold -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} /// cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp b/hal/riscv-rvv/src/imgproc/warp.cpp similarity index 96% rename from hal/riscv-rvv/hal_rvv_1p0/warp.hpp rename to hal/riscv-rvv/src/imgproc/warp.cpp index cd4218a2cd..2a9cef6934 100644 --- a/hal/riscv-rvv/hal_rvv_1p0/warp.hpp +++ b/hal/riscv-rvv/src/imgproc/warp.cpp @@ -4,22 +4,14 @@ // Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. -#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED -#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED +#include "rvv_hal.hpp" +#include "common.hpp" -#include +namespace cv { namespace rvv_hal { namespace imgproc { -namespace cv { namespace cv_hal_rvv { +#if CV_HAL_RVV_1P0_ENABLED -namespace remap { - -// BUG: https://github.com/opencv/opencv/issues/27279 -// #undef cv_hal_remap32f -// #define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f -// #undef cv_hal_remap32fc2 -// #define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2 -// #undef cv_hal_remap16s -// #define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s +namespace { class RemapInvoker : public ParallelLoopBody { @@ -794,6 +786,8 @@ static inline int remap32fC4(int start, int end, const uchar *src_data, size_t s return CV_HAL_ERROR_OK; } +} // anonymous + // the algorithm is copied from 3rdparty/carotene/src/remap.cpp, // in the function void CAROTENE_NS::remapNearestNeighbor and void CAROTENE_NS::remapLinear template @@ -880,17 +874,6 @@ inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int sr return CV_HAL_ERROR_NOT_IMPLEMENTED; return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, interpolation, border_type, border_value); } -} // cv::cv_hal_rvv::remap - -namespace warp { - -// BUG: https://github.com/opencv/opencv/issues/27280 -//#undef cv_hal_warpAffine -//#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine - -// BUG: https://github.com/opencv/opencv/issues/27281 -//#undef cv_hal_warpPerspective -//#define cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective template static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) @@ -1162,7 +1145,7 @@ static inline int warpC4(int start, int end, const uchar *src_data, size_t src_s // the 
algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp, // in the function void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear -inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) +int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1174,11 +1157,11 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1186,7 +1169,7 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int // the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp, // in the function void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear -inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) +int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]) { if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4) return CV_HAL_ERROR_NOT_IMPLEMENTED; @@ -1198,17 +1181,16 @@ inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, switch (src_type) { case CV_8UC1: - return remap::invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC1}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC3: - return remap::invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, 
interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); case CV_8UC4: - return remap::invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); + return invoke(dst_width, dst_height, {warpC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue); } return CV_HAL_ERROR_NOT_IMPLEMENTED; } -} // cv::cv_hal_rvv::warp -}} +#endif // CV_HAL_RVV_1P0_ENABLED -#endif +}}} // cv::rvv_hal::imgproc diff --git a/hal/riscv-rvv/version/hal_rvv_071.hpp b/hal/riscv-rvv/version/hal_rvv_071.hpp deleted file mode 100644 index db235d6139..0000000000 --- a/hal/riscv-rvv/version/hal_rvv_071.hpp +++ /dev/null @@ -1,109 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -#ifndef OPENCV_HAL_RVV_071_HPP_INCLUDED -#define OPENCV_HAL_RVV_071_HPP_INCLUDED - -#include - -#include - -namespace cv { namespace cv_hal_rvv { - -#undef cv_hal_cvtBGRtoBGR -#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::cvtBGRtoBGR - -static const unsigned char index_array_32 [32] - { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 }; - -static const unsigned char index_array_24 [24] - { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 }; - -static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize) -{ - vuint8m2_t vec_index = vle8_v_u8m2(index, vsize); - - int i = 0; - - for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize) - { - vuint8m2_t vec_src = vle8_v_u8m2(src, vsize); - vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize); - vse8_v_u8m2(dst, vec_dst, vsize); - } - - for ( ; i < n; i++, src += scn, dst += dcn ) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[2] = t0; - dst[1] = t1; - dst[0] = t2; - if(dcn == 4) - { - unsigned char d = src[3]; - dst[3] = d; - } - } -} - -static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi) -{ - for (int i = 0; i < n; i++, src += scn, dst += dcn) - { - unsigned char t0 = src[0], t1 = src[1], t2 = src[2]; - dst[bi ] = t0; - dst[1] = t1; - dst[bi^2] = t2; - if(dcn == 4) - { - unsigned char d = scn == 4 ? src[3] : std::numeric_limits::max(); - dst[3] = d; - } - } -} - -static int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue) -{ - if (depth != CV_8U) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int blueIdx = swapBlue ? 
2 : 0; - if (scn == dcn) - { - if (!swapBlue) - { - return CV_HAL_ERROR_NOT_IMPLEMENTED; - } - - const int vsize_pixels = 8; - - if (scn == 4) - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32); - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - { - vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24); - } - } - } - else - { - for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step) - sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx); - } - - return CV_HAL_ERROR_OK; -} - -}} - -#endif diff --git a/modules/3d/src/solvepnp.cpp b/modules/3d/src/solvepnp.cpp index a4f03b0269..90ea1c1232 100644 --- a/modules/3d/src/solvepnp.cpp +++ b/modules/3d/src/solvepnp.cpp @@ -99,7 +99,8 @@ void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray d CV_CheckType(type, cn == 1 || cn == 3 || cn == 4, "Number of channels must be 1, 3 or 4" ); - CV_Assert(image.getMat().total() > 0); + cv::Mat img = image.getMat(); + CV_Assert(img.total() > 0); CV_Assert(length > 0); // project axes points @@ -111,6 +112,18 @@ void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray d std::vector<Point2f> imagePoints; projectPoints(axesPoints, rvec, tvec, cameraMatrix, distCoeffs, imagePoints); + cv::Rect imageRect(0, 0, img.cols, img.rows); + bool allIn = true; + for (size_t i = 0; i < imagePoints.size(); i++) + { + allIn &= imageRect.contains(imagePoints[i]); + } + + if (!allIn) + { + CV_LOG_WARNING(NULL, "Some of the projected axes endpoints are out of frame. The drawn axes may not be reliable."); + } + // draw axes lines line(image, imagePoints[0], imagePoints[1], Scalar(0, 0, 255), thickness); line(image, imagePoints[0], imagePoints[2], Scalar(0, 255, 0), thickness); diff --git a/modules/3d/src/usac.hpp b/modules/3d/src/usac.hpp index f3dc55b8d7..2c6ab9fee2 100644 --- a/modules/3d/src/usac.hpp +++ b/modules/3d/src/usac.hpp @@ -17,7 +17,7 @@ class Error : public Algorithm { public: // set model to use getError() function virtual void setModelParameters (const Mat &model) = 0; - // returns error of point wih @point_idx w.r.t. model + // returns error of point with @point_idx w.r.t. model virtual float getError (int point_idx) const = 0; virtual const std::vector<float> &getErrors (const Mat &model) = 0; }; diff --git a/modules/calib/test/test_fisheye.cpp b/modules/calib/test/test_fisheye.cpp index bf091fedd9..5c25a2c642 100644 --- a/modules/calib/test/test_fisheye.cpp +++ b/modules/calib/test/test_fisheye.cpp @@ -175,7 +175,7 @@ TEST_F(fisheyeTest, CalibrationWithFixedFocalLength) cv::fisheye::calibrate(objectPoints, imagePoints, imageSize, theK, theD, cv::noArray(), cv::noArray(), flag, cv::TermCriteria(3, 20, 1e-6)); - // ensure that CALIB_FIX_FOCAL_LENGTH works and focal lenght has not changed + // ensure that CALIB_FIX_FOCAL_LENGTH works and focal length has not changed EXPECT_EQ(theK(0,0), K(0,0)); EXPECT_EQ(theK(1,1), K(1,1)); diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp index 3a02f260d6..3dab066a4c 100644 --- a/modules/core/include/opencv2/core.hpp +++ b/modules/core/include/opencv2/core.hpp @@ -1965,8 +1965,8 @@ The function solveCubic finds the real roots of a cubic equation: The roots are stored in the roots array. @param coeffs equation coefficients, an array of 3 or 4 elements.
-@param roots output array of real roots that has 1 or 3 elements. -@return number of real roots. It can be 0, 1 or 2. +@param roots output array of real roots that has 0, 1, 2 or 3 elements. +@return number of real roots. It can be -1 (all real numbers), 0, 1, 2 or 3. */ CV_EXPORTS_W int solveCubic(InputArray coeffs, OutputArray roots); diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 7eeed2ce9b..35fda729dd 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -225,32 +225,30 @@ These operations allow to reorder or recombine elements in one or multiple vecto Element-wise binary and unary operations. - Arithmetics: -@ref v_add(const v_reg &a, const v_reg &b) "+", -@ref v_sub(const v_reg &a, const v_reg &b) "-", -@ref v_mul(const v_reg &a, const v_reg &b) "*", -@ref v_div(const v_reg &a, const v_reg &b) "/", +@ref v_add, +@ref v_sub, +@ref v_mul, +@ref v_div, @ref v_mul_expand - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap - Bitwise shifts: -@ref v_shl(const v_reg &a, int s) "<<", -@ref v_shr(const v_reg &a, int s) ">>", @ref v_shl, @ref v_shr - Bitwise logic: -@ref v_and(const v_reg &a, const v_reg &b) "&", -@ref v_or(const v_reg &a, const v_reg &b) "|", -@ref v_xor(const v_reg &a, const v_reg &b) "^", -@ref v_not(const v_reg &a) "~" +@ref v_and, +@ref v_or, +@ref v_xor, +@ref v_not - Comparison: -@ref v_gt(const v_reg &a, const v_reg &b) ">", -@ref v_ge(const v_reg &a, const v_reg &b) ">=", -@ref v_lt(const v_reg &a, const v_reg &b) "<", -@ref v_le(const v_reg &a, const v_reg &b) "<=", -@ref v_eq(const v_reg &a, const v_reg &b) "==", -@ref v_ne(const v_reg &a, const v_reg &b) "!=" +@ref v_gt, +@ref v_ge, +@ref v_lt, +@ref v_le, +@ref v_eq, +@ref v_ne - min/max: @ref v_min, @ref v_max diff --git a/modules/core/include/opencv2/core/hal/intrin_legacy_ops.h b/modules/core/include/opencv2/core/hal/intrin_legacy_ops.h new file mode 100644 index 0000000000..764c245df9 --- /dev/null +++ b/modules/core/include/opencv2/core/hal/intrin_legacy_ops.h @@ -0,0 +1,111 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This file has been created for compatibility with older versions of Universal Intrinsics +// Binary operators for vector types have been removed since version 4.11 +// Include this file manually after OpenCV headers if you need these operators + +#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP +#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP + +#ifdef __OPENCV_BUILD +#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library" +#endif + +#ifdef __riscv +#warning "Operators might conflict with built-in functions on RISC-V platform" +#endif + +#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9 +#warning "Older versions of OpenCV (<4.9) already have Universal Intrinsics operators" +#endif + + +namespace cv { namespace hal { + +#define BIN_OP(OP, FUN) \ +template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); } + +#define BIN_A_OP(OP, FUN) \ +template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; } + +#define UN_OP(OP, FUN) \ +template <typename R> R operator OP (const R & val) { return FUN(val); } + +BIN_OP(+, v_add) +BIN_OP(-, v_sub) +BIN_OP(*, v_mul) +BIN_OP(/, v_div) +BIN_OP(&, v_and) +BIN_OP(|, v_or) +BIN_OP(^, v_xor) + +BIN_OP(==, v_eq) +BIN_OP(!=, v_ne) +BIN_OP(<, v_lt) +BIN_OP(>, v_gt) +BIN_OP(<=, v_le) +BIN_OP(>=, v_ge) + +BIN_A_OP(+=, v_add) +BIN_A_OP(-=, v_sub) +BIN_A_OP(*=, v_mul) +BIN_A_OP(/=, v_div) +BIN_A_OP(&=, v_and) +BIN_A_OP(|=, v_or) +BIN_A_OP(^=, v_xor) + +UN_OP(~, v_not) + +// TODO: shift operators? + +}} // cv::hal:: + +//============================================================================== + +#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST + +namespace cv { namespace hal { + +inline static void opencv_operator_compile_test() +{ + using namespace cv; + v_float32 a, b, c; + uint8_t shift = 1; + a = b + c; + a = b - c; + a = b * c; + a = b / c; + a = b & c; + a = b | c; + a = b ^ c; + // a = b >> shift; + // a = b << shift; + + a = (b == c); + a = (b != c); + a = (b < c); + a = (b > c); + a = (b <= c); + a = (b >= c); + + a += b; + a -= b; + a *= b; + a /= b; + a &= b; + a |= b; + a ^= b; + // a <<= shift; + // a >>= shift; + + a = ~b; +} + +}} // cv::hal:: + +#endif + + +#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP diff --git a/modules/core/include/opencv2/core/mat.inl.hpp b/modules/core/include/opencv2/core/mat.inl.hpp index 59df23e83b..be1e192e84 100644 --- a/modules/core/include/opencv2/core/mat.inl.hpp +++ b/modules/core/include/opencv2/core/mat.inl.hpp @@ -3184,6 +3184,12 @@ Mat_<_Tp>& Mat_<_Tp>::operator = (const MatExpr& e) return *this; } +template<typename _Tp> inline +MatExpr Mat_<_Tp>::zeros(int _ndims, const int* _sizes) +{ + return Mat::zeros(_ndims, _sizes, traits::Type<_Tp>::value); +} + template<typename _Tp> inline MatExpr Mat_<_Tp>::zeros(int rows, int cols) { diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp index 39f2ddcdeb..4250f61033 100644 --- a/modules/core/include/opencv2/core/private.cuda.hpp +++ b/modules/core/include/opencv2/core/private.cuda.hpp @@ -147,7 +147,23 @@ namespace cv { namespace cuda inline explicit NppStreamHandler(cudaStream_t newStream) { nppStreamContext = {}; - nppSafeCall(nppGetStreamContext(&nppStreamContext)); + #if CUDA_VERSION < 12090 + nppSafeCall(nppGetStreamContext(&nppStreamContext)); + #else + int device = 0; +
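// For CUDA 12.9 and newer the NPP stream context is assembled manually from the device properties queried below, instead of via nppGetStreamContext(). +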
cudaSafeCall(cudaGetDevice(&device)); + + cudaDeviceProp prop{}; + cudaSafeCall(cudaGetDeviceProperties(&prop, device)); + + nppStreamContext.nCudaDeviceId = device; + nppStreamContext.nMultiProcessorCount = prop.multiProcessorCount; + nppStreamContext.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor; + nppStreamContext.nMaxThreadsPerBlock = prop.maxThreadsPerBlock; + nppStreamContext.nSharedMemPerBlock = prop.sharedMemPerBlock; + nppStreamContext.nCudaDevAttrComputeCapabilityMajor = prop.major; + nppStreamContext.nCudaDevAttrComputeCapabilityMinor = prop.minor; + #endif nppStreamContext.hStream = newStream; cudaSafeCall(cudaStreamGetFlags(nppStreamContext.hStream, &nppStreamContext.nStreamFlags)); } diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index ad5ae671d0..7c2f0a02b7 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -694,7 +694,7 @@ OCL_PERF_TEST_P(PowFixture, Pow, ::testing::Combine( ///////////// iPow //////////////////////// OCL_PERF_TEST_P(PowFixture, iPow, ::testing::Combine( - OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8SC1,CV_16UC1,CV_16SC1,CV_32SC1))) + OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC3, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1))) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); @@ -706,7 +706,7 @@ OCL_PERF_TEST_P(PowFixture, iPow, ::testing::Combine( randu(src, 0, 100); declare.in(src).out(dst); - OCL_TEST_CYCLE() cv::pow(src, 7.0, dst); + OCL_TEST_CYCLE() cv::pow(src, 3, dst); SANITY_CHECK_NOTHING(); } diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index 0c9254a872..a93a3b5291 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -1223,8 +1223,22 @@ inline int hal_ni_copyToMasked(const uchar* src_data, size_t src_step, uchar* ds #define cv_hal_copyToMasked hal_ni_copyToMasked //! @endcond -//! @} +/** + @ brief sum + @param src_data Source image data + @param src_step Source image step + @param src_type Source image type + @param width, height Source image dimensions + @param result Pointer to save the sum result to. + */ +inline int hal_ni_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result) +{ return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @cond IGNORED +#define cv_hal_sum hal_ni_sum +//! @endcond + +//! @} #if defined(__clang__) #pragma clang diagnostic pop diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index cc24ebbb31..b2ea6aa46e 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -938,9 +938,40 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst, bool issqrt = std::abs(power - 0.5) < DBL_EPSILON; const char * const op = issqrt ? "OP_SQRT" : is_ipower ? 
"OP_POWN" : "OP_POW"; + // Note: channels are unrolled + + std::string extra_opts =""; + if (is_ipower) + { + int wdepth = CV_32F; + if (depth == CV_64F) + wdepth = CV_64F; + else if (depth == CV_16F) + wdepth = CV_16F; + + char cvt[2][50]; + extra_opts = format( + " -D srcT1=%s -DsrcT1_C1=%s" + " -D srcT2=int -D workST=int" + " -D workT=%s -D wdepth=%d -D convertToWT1=%s" + " -D convertToDT=%s" + " -D workT1=%s", + ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), + ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, 1)), + wdepth, + ocl::convertTypeStr(depth, wdepth, 1, cvt[0], sizeof(cvt[0])), + ocl::convertTypeStr(wdepth, depth, 1, cvt[1], sizeof(cvt[1])), + ocl::typeToStr(wdepth) + ); + } + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, - format("-D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D %s -D UNARY_OP%s", - ocl::typeToStr(depth), depth, rowsPerWI, op, + format("-D cn=%d -D dstT=%s -D dstT_C1=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D %s%s%s%s", + 1, + ocl::typeToStr(depth), ocl::typeToStr(depth), depth, rowsPerWI, op, + " -D UNARY_OP=1", + extra_opts.empty() ? "" : extra_opts.c_str(), doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -1396,7 +1427,7 @@ int cv::solveCubic( InputArray _coeffs, OutputArray _roots ) { if( a1 == 0 ) { - if( a2 == 0 ) + if( a2 == 0 ) // constant n = a3 == 0 ? -1 : 0; else { @@ -1430,15 +1461,23 @@ int cv::solveCubic( InputArray _coeffs, OutputArray _roots ) } else { + // cubic equation a0 = 1./a0; a1 *= a0; a2 *= a0; a3 *= a0; double Q = (a1 * a1 - 3 * a2) * (1./9); - double R = (2 * a1 * a1 * a1 - 9 * a1 * a2 + 27 * a3) * (1./54); + double R = (a1 * (2 * a1 * a1 - 9 * a2) + 27 * a3) * (1./54); double Qcubed = Q * Q * Q; - double d = Qcubed - R * R; + /* + Here we expand expression `Qcubed - R * R` for `d` variable + to reduce common terms `a1^6 / 729` and `-a1^4 * a2 / 81` + and thus decrease rounding error (in case of quite big coefficients). + + And then we additionally group terms to further reduce rounding error. 
+ */ + double d = (a1 * a1 * (a2 * a2 - 4 * a1 * a3) + 2 * a2 * (9 * a1 * a3 - 2 * a2 * a2) - 27 * a3 * a3) * (1./108); if( d > 0 ) { diff --git a/modules/core/src/norm.dispatch.cpp b/modules/core/src/norm.dispatch.cpp index 5383954900..b673f45603 100644 --- a/modules/core/src/norm.dispatch.cpp +++ b/modules/core/src/norm.dispatch.cpp @@ -559,7 +559,7 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask ((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) ); NormDiffFunc func = getNormDiffFunc(normType >> 1, depth); - CV_Assert( func != 0 ); + CV_Assert( (normType >> 1) >= 3 || func != 0 ); if( src1.isContinuous() && src2.isContinuous() && mask.empty() ) { diff --git a/modules/core/src/norm.simd.hpp b/modules/core/src/norm.simd.hpp index 32fb19f1b5..aadd31588e 100644 --- a/modules/core/src/norm.simd.hpp +++ b/modules/core/src/norm.simd.hpp @@ -1581,6 +1581,7 @@ NormDiffFunc getNormDiffFunc(int normType, int depth) 0 }, }; + if (normType >= 3 || normType < 0) return nullptr; return normDiffTab[normType][depth]; } diff --git a/modules/core/src/opencl/arithm.cl b/modules/core/src/opencl/arithm.cl index 301cea9f98..bbf3b83c2c 100644 --- a/modules/core/src/opencl/arithm.cl +++ b/modules/core/src/opencl/arithm.cl @@ -80,6 +80,10 @@ #error "Kernel configuration error: ambiguous 'depth' value is defined, use 'DEPTH_dst' instead" #endif +#define CAT__(x, y) x ## y +#define CAT_(x, y) CAT__(x, y) +#define CAT(x, y) CAT_(x, y) + #if DEPTH_dst < 5 /* CV_32F */ #define CV_DST_TYPE_IS_INTEGER @@ -325,9 +329,12 @@ #define PROCESS_ELEM storedst(pow(srcelem1, srcelem2)) #elif defined OP_POWN -#undef workT -#define workT int -#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2)) +#if cn > 1 +#define PROCESS_INIT CAT(int, cn) powi = (CAT(int, cn))srcelem2; +#else // cn +#define PROCESS_INIT int powi = srcelem2; +#endif +#define PROCESS_ELEM storedst(convertToDT(pown(srcelem1, powi))) #elif defined OP_SQRT #if CV_DST_TYPE_FIT_32F @@ -469,7 +476,7 @@ #define srcelem2 srcelem2_ #endif -#if cn == 3 +#if !defined(PROCESS_INIT) && cn == 3 #undef srcelem2 #define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z) #endif @@ -517,6 +524,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int x = get_global_id(0); int y0 = get_global_id(1) * rowsPerWI; +#ifdef PROCESS_INIT + PROCESS_INIT +#endif + if (x < cols) { int mask_index = mad24(y0, maskstep, x + maskoffset); @@ -542,6 +553,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int x = get_global_id(0); int y0 = get_global_id(1) * rowsPerWI; +#ifdef PROCESS_INIT + PROCESS_INIT +#endif + if (x < cols) { int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1)); @@ -564,6 +579,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1, int x = get_global_id(0); int y0 = get_global_id(1) * rowsPerWI; +#ifdef PROCESS_INIT + PROCESS_INIT +#endif + if (x < cols) { int mask_index = mad24(y0, maskstep, x + maskoffset); diff --git a/modules/core/src/sum.dispatch.cpp b/modules/core/src/sum.dispatch.cpp index e56e10117f..6d2aa0b188 100644 --- a/modules/core/src/sum.dispatch.cpp +++ b/modules/core/src/sum.dispatch.cpp @@ -10,14 +10,6 @@ #include "sum.simd.hpp" #include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content -#ifndef OPENCV_IPP_SUM -#undef HAVE_IPP -#undef CV_IPP_RUN_FAST -#define CV_IPP_RUN_FAST(f, ...) 
-#undef CV_IPP_RUN -#define CV_IPP_RUN(c, f, ...) -#endif // OPENCV_IPP_SUM - namespace cv { @@ -126,95 +118,45 @@ bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask, #endif -#ifdef HAVE_IPP -static bool ipp_sum(Mat &src, Scalar &_res) -{ - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 >= 700 - int cn = src.channels(); - if (cn > 4) - return false; - size_t total_size = src.total(); - int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0; - if( src.dims <= 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) ) - { - IppiSize sz = { cols, rows }; - int type = src.type(); - typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm); - typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *); - ippiSumFuncHint ippiSumHint = - type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R : - type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R : - type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R : - 0; - ippiSumFuncNoHint ippiSum = - type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R : - type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R : - type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R : - type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R : - type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R : - type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R : - type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R : - type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R : - type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R : - 0; - CV_Assert(!ippiSumHint || !ippiSum); - if( ippiSumHint || ippiSum ) - { - Ipp64f res[4]; - IppStatus ret = ippiSumHint ? - CV_INSTRUMENT_FUN_IPP(ippiSumHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) : - CV_INSTRUMENT_FUN_IPP(ippiSum, src.ptr(), (int)src.step[0], sz, res); - if( ret >= 0 ) - { - for( int i = 0; i < cn; i++ ) - _res[i] = res[i]; - return true; - } - } - } -#else - CV_UNUSED(src); CV_UNUSED(_res); -#endif - return false; -} -#endif - Scalar sum(InputArray _src) { CV_INSTRUMENT_REGION(); -#if defined HAVE_OPENCL || defined HAVE_IPP Scalar _res; -#endif - #ifdef HAVE_OPENCL CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2, ocl_sum(_src, _res, OCL_OP_SUM), - _res) + _res); #endif Mat src = _src.getMat(); - CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res); + int cn = src.channels(); + CV_CheckLE( cn, 4, "cv::sum does not support more than 4 channels" ); - int k, cn = src.channels(), depth = src.depth(); + if (_src.dims() <= 2) + { + CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, src.step, src.type(), src.cols, src.rows, &_res[0]); + } + else if (_src.isContinuous()) + { + CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, 0, src.type(), (int)src.total(), 1, &_res[0]); + } + + int k, depth = src.depth(); SumFunc func = getSumFunc(depth); if (func == nullptr) { if (depth == CV_Bool && cn == 1) return Scalar((double)countNonZero(src)); CV_Error(Error::StsNotImplemented, ""); } - CV_Assert( cn <= 4 && func != 0 ); const Mat* arrays[] = {&src, 0}; uchar* ptrs[1] = {}; NAryMatIterator it(arrays, ptrs); - Scalar s; int total = (int)it.size, blockSize = total, partialBlockSize = 0; int j, count = 0; int _buf[CV_CN_MAX]; - int* buf = (int*)&s[0]; + int* buf = (int*)&_res[0]; size_t esz = 0; bool partialSumIsInt = depth < CV_32S; bool blockSum = partialSumIsInt || depth == CV_16F || depth == CV_16BF; @@ -241,13 +183,13 @@ Scalar sum(InputArray _src) if (partialSumIsInt) { for( k = 
0; k < cn; k++ ) { - s[k] += buf[k]; + _res[k] += buf[k]; buf[k] = 0; } } else { for( k = 0; k < cn; k++ ) { - s[k] += ((float*)buf)[k]; + _res[k] += ((float*)buf)[k]; buf[k] = 0; } } @@ -256,7 +198,7 @@ Scalar sum(InputArray _src) ptrs[0] += bsz*esz; } } - return s; + return _res; } } // namespace diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp index bb2a273daa..cbf31ff3a0 100644 --- a/modules/core/test/ocl/test_arithm.cpp +++ b/modules/core/test/ocl/test_arithm.cpp @@ -132,19 +132,25 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool) use_roi = GET_PARAM(2); } - void generateTestData(bool with_val_in_range = false) + void generateTestData(bool with_val_in_range = false, + double minVal1 = std::numeric_limits::quiet_NaN(), double maxVal1 = std::numeric_limits::quiet_NaN(), + double minVal2 = std::numeric_limits::quiet_NaN(), double maxVal2 = std::numeric_limits::quiet_NaN() + ) { const int type = CV_MAKE_TYPE(depth, cn); - double minV = cvtest::getMinVal(type); - double maxV = cvtest::getMaxVal(type); + double minV1 = cvIsNaN(minVal1) ? 2 : minVal1; + double maxV1 = cvIsNaN(maxVal1) ? 11 : maxVal1; + + double minV2 = cvIsNaN(minVal2) ? std::max(-1540., cvtest::getMinVal(type)) : minVal2; + double maxV2 = cvIsNaN(maxVal2) ? std::min(1740., cvtest::getMaxVal(type)) : maxVal2; Size roiSize = randomSize(1, MAX_VALUE); Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(src1, src1_roi, roiSize, src1Border, type, 2, 11); // FIXIT: Test with minV, maxV + randomSubMat(src1, src1_roi, roiSize, src1Border, type, minV1, maxV1); // FIXIT: Test with minV, maxV Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0); - randomSubMat(src2, src2_roi, roiSize, src2Border, type, std::max(-1540., minV), std::min(1740., maxV)); + randomSubMat(src2, src2_roi, roiSize, src2Border, type, minV2, maxV2); Border dst1Border = randomBorder(0, use_roi ? 
MAX_VALUE : 0); randomSubMat(dst1, dst1_roi, roiSize, dst1Border, type, 5, 16); @@ -162,8 +168,8 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool) if (with_val_in_range) { - val_in_range = cv::Scalar(rng.uniform(minV, maxV), rng.uniform(minV, maxV), - rng.uniform(minV, maxV), rng.uniform(minV, maxV)); + val_in_range = cv::Scalar(rng.uniform(minV1, maxV1), rng.uniform(minV1, maxV1), + rng.uniform(minV1, maxV1), rng.uniform(minV1, maxV1)); } UMAT_UPLOAD_INPUT_PARAMETER(src1); @@ -844,14 +850,30 @@ OCL_TEST_P(Pow, Mat) for (int j = 0; j < 1/*test_loop_times*/; j++) for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k) { - SCOPED_TRACE(pows[k]); + SCOPED_TRACE(cv::format("POW=%g", pows[k])); - generateTestData(); + generateTestData(false, 1, 3); OCL_OFF(cv::pow(src1_roi, pows[k], dst1_roi)); OCL_ON(cv::pow(usrc1_roi, pows[k], udst1_roi)); OCL_EXPECT_MATS_NEAR_RELATIVE(dst1, 1e-5); + + if (cvtest::debugLevel >= 100) + { + cv::Rect roi(0, 0, 4, 4); + std::cout << src1_roi(roi) << std::endl; + std::cout << dst1_roi(roi) << std::endl; + std::cout << udst1_roi(roi) << std::endl; + + Mat diff; + cv::absdiff(dst1_roi, udst1_roi, diff); + std::cout << std::endl << diff(roi) << std::endl; + + std::cout << std::endl << dst1_roi << std::endl; + std::cout << std::endl << udst1_roi << std::endl; + std::cout << std::endl << diff << std::endl; + } } } diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index 387697b9aa..5479f41726 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -24,6 +24,19 @@ void test_hal_intrin_float16(); //================================================================================================== +#if defined (__clang__) && defined(__has_warning) + #if __has_warning("-Wmaybe-uninitialized") + #define CV_DISABLE_GCC_MAYBE_UNINITIALIZED_WARNINGS + #endif +#elif defined (__GNUC__) // in case of gcc, it does not have macro __has_warning + #define CV_DISABLE_GCC_MAYBE_UNINITIALIZED_WARNINGS +#endif + +#if defined (CV_DISABLE_GCC_MAYBE_UNINITIALIZED_WARNINGS) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + template struct Data { typedef typename VTraits::lane_type LaneType; @@ -2597,6 +2610,10 @@ void test_hal_intrin_float16() #endif } +#if defined (CV_DISABLE_GCC_MAYBE_UNINITIALIZED_WARNINGS) +#pragma GCC diagnostic pop +#endif + #endif //CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY //CV_CPU_OPTIMIZATION_NAMESPACE_END diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index 0fea9fa13f..9166e1a0c5 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1323,6 +1323,13 @@ TEST(Core_Mat, copyMakeBoderUndefinedBehavior) EXPECT_EQ(0, cv::norm(src.col(2), dst(Rect(5,1,1,4)))); } +TEST(Core_Mat, zeros) +{ + // Should not fail during linkage. 
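+    // Exercises the new n-dimensional Mat_<_Tp>::zeros(int _ndims, const int* _sizes) overload introduced in mat.inl.hpp.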
+ const int dims[] = {2, 2, 4}; + cv::Mat1f mat = cv::Mat1f::zeros(3, dims); +} + TEST(Core_Matx, fromMat_) { Mat_ a = (Mat_(2,2) << 10, 11, 12, 13); diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp index 5a3e1a5a36..9242070339 100644 --- a/modules/core/test/test_math.cpp +++ b/modules/core/test/test_math.cpp @@ -1734,7 +1734,7 @@ static void checkRoot(Mat& r, T re, T im) { for (int i = 0; i < r.cols*r.rows; i++) { - Vec v = *(Vec*)r.ptr(i); + Vec& v = *(Vec*)r.ptr(i); if (fabs(re - v[0]) < 1e-6 && fabs(im - v[1]) < 1e-6) { v[0] = std::numeric_limits::quiet_NaN(); @@ -1744,6 +1744,179 @@ static void checkRoot(Mat& r, T re, T im) } GTEST_NONFATAL_FAILURE_("Can't find root") << "(" << re << ", " << im << ")"; } + +TEST(Core_SolveCubicConstant, accuracy) +{ + { + const std::vector coeffs{0., 0., 0., 1.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 0); + } + + { + const std::vector coeffs{0., 0., 0., 0.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, -1); + } +} + +TEST(Core_SolveCubicLinear, accuracy) +{ + const std::vector coeffs{0., 0., 2., -2.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], 1.); +} + +TEST(Core_SolveCubicQuadratic, accuracy) +{ + { + const std::vector coeffs{0., 2., -4., 4.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 0); + } + + { + const std::vector coeffs{0., 2., -4., 2.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], 1.); + } + + { + const std::vector coeffs{0., 2., -6., 4.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 2); + EXPECT_EQ(roots[0], 2.); + EXPECT_EQ(roots[1], 1.); + } +} + +TEST(Core_SolveCubicCubic, accuracy) +{ + { + const std::vector coeffs{2., -6., 6., -2.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], 1.); + } + + { + const std::vector coeffs{2., -10., 24., -16.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_NEAR(roots[0], 1., 1e-8); + } + + { + const std::vector coeffs{2., -10., 16., -8.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_TRUE(num_roots == 2 || num_roots == 3); + EXPECT_NEAR(roots[0], 1., 1e-8); + EXPECT_NEAR(roots[1], 2., 1e-8); + if (num_roots == 3) + { + EXPECT_NEAR(roots[2], 2., 1e-8); + } + } + + { + const std::vector coeffs{2., -12., 22., -12.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 3); + EXPECT_NEAR(roots[0], 1., 1e-8); + EXPECT_NEAR(roots[1], 3., 1e-8); + EXPECT_NEAR(roots[2], 2., 1e-8); + } +} + +TEST(Core_SolveCubicNormalizedCubic, accuracy) +{ + { + const std::vector coeffs{-3., 3., -1.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], 1.); + } + + { + const std::vector coeffs{-5., 12., -8.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_NEAR(roots[0], 1., 1e-8); + } + + { + const std::vector coeffs{-5., 8., -4.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_TRUE(num_roots == 2 || num_roots == 3); + 
EXPECT_NEAR(roots[0], 1., 1e-8); + EXPECT_NEAR(roots[1], 2., 1e-8); + if (num_roots == 3) + { + EXPECT_NEAR(roots[2], 2., 1e-8); + } + } + + { + const std::vector coeffs{-6., 11., -6.}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 3); + EXPECT_NEAR(roots[0], 1., 1e-8); + EXPECT_NEAR(roots[1], 3., 1e-8); + EXPECT_NEAR(roots[2], 2., 1e-8); + } +} + +TEST(Core_SolveCubic, regression_27323) +{ + { + const std::vector coeffs{2e-13, 1, -2, 1}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], -5e12 - 2.); + } + + { + const std::vector coeffs{5e12, -1e13, 5e12}; + std::vector roots; + const auto num_roots = solveCubic(coeffs, roots); + + EXPECT_EQ(num_roots, 1); + EXPECT_EQ(roots[0], -5e12 - 2.); + } +} + TEST(Core_SolvePoly, regression_5599) { // x^4 - x^2 = 0, roots: 1, -1, 0, 0 diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 43453a4e43..2c3b5ff9e6 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -72,8 +72,8 @@ CV__DNN_INLINE_NS_BEGIN { //! DNN_BACKEND_DEFAULT equals to OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter DNN_BACKEND_DEFAULT = 0, - DNN_BACKEND_INFERENCE_ENGINE = 2, //!< Intel OpenVINO computational backend - //!< @note Tutorial how to build OpenCV with OpenVINO: @ref tutorial_dnn_openvino + DNN_BACKEND_INFERENCE_ENGINE = 2, //!< Intel OpenVINO computational backend, supported targets: CPU, OPENCL, OPENCL_FP16, MYRIAD, HDDL, NPU + //!< @note Tutorial how to build OpenCV with OpenVINO: @ref tutorial_dnn_openvino DNN_BACKEND_OPENCV, DNN_BACKEND_VKCOM, DNN_BACKEND_CUDA, diff --git a/modules/dnn/misc/java/test/DnnListRegressionTest.java b/modules/dnn/misc/java/test/DnnListRegressionTest.java index 81944d1da9..a456723731 100644 --- a/modules/dnn/misc/java/test/DnnListRegressionTest.java +++ b/modules/dnn/misc/java/test/DnnListRegressionTest.java @@ -83,11 +83,9 @@ public class DnnListRegressionTest extends OpenCVTestCase { }*/ public void testForward() { - List outs = new ArrayList(); - List outNames = new ArrayList(); - //outNames.add(""); + Mat out; try { - net.forward(outs,outNames); + out = net.forward(); } catch(Exception e) { fail("Net forward failed: " + e.getMessage()); } diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp index c90dc9dceb..6a30b299eb 100644 --- a/modules/dnn/src/ie_ngraph.cpp +++ b/modules/dnn/src/ie_ngraph.cpp @@ -282,6 +282,9 @@ void InfEngineNgraphNet::init(Target targetId) case DNN_TARGET_FPGA: device_name = "FPGA"; break; + case DNN_TARGET_NPU: + device_name = "NPU"; + break; default: CV_Error(Error::StsNotImplemented, "Unknown target"); }; diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 153415bc25..b1e08df9f0 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -81,7 +81,6 @@ public: bool fusedWeights, fusedBias; std::vector weightsMultipliers; int groups; - BaseConvolutionLayerImpl(const LayerParams ¶ms) { setParamsFrom(params); diff --git a/modules/dnn/src/layers/cpu_kernels/softmax.cpp b/modules/dnn/src/layers/cpu_kernels/softmax.cpp index fd55c1c1de..3b670232cc 100644 --- a/modules/dnn/src/layers/cpu_kernels/softmax.cpp +++ b/modules/dnn/src/layers/cpu_kernels/softmax.cpp @@ -37,8 +37,6 @@ void softmax(Mat &dst, const Mat &src, int axis, int 
axisBias, int axisStep){ #if (CV_SIMD || CV_SIMD_SCALABLE) const int nlanes = VTraits::vlanes(); - // the number of redundant dimension - size_t redundantDim = nlanes - axisStep % nlanes; #endif parallel_for_(Range(0, (int) totalTasks), [&](const Range &range) { @@ -50,61 +48,55 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){ size_t innerDim = i % innerSize; size_t srcOffset = outerDim * outerStep + innerDim; // copy data from src to buf along axis, since the data may not be continuous - for (size_t cnDim = 0; cnDim < axisStep; cnDim++) - axisBuf[cnDim] = srcPtr[srcOffset + (cnDim + axisBias) * cnStep]; + for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++) + axisBuf[_cnDim] = srcPtr[srcOffset + (_cnDim + axisBias) * cnStep]; - float s = 0.f; + float maxVal = -FLT_MAX; + int cnDim = 0; #if (CV_SIMD || CV_SIMD_SCALABLE) - // make the value of the redundant dimension to be -FLT_MAX - if (redundantDim != nlanes) { - for (size_t j = axisStep; j < axisStep + redundantDim; j++) - axisBuf[j] = -FLT_MAX; - } // calculate the max value along the axis - v_float32 vmax = vx_load(axisBuf); - for (size_t cnDim = nlanes; cnDim < axisStep; cnDim += nlanes) { + v_float32 vmax = vx_setall_f32(-FLT_MAX); + for (; cnDim < axisStep; cnDim += nlanes) { + if (cnDim > axisStep - nlanes) { + if (cnDim == 0) { break; } + cnDim = axisStep - nlanes; + } v_float32 val = vx_load(axisBuf + cnDim); vmax = v_max(vmax, val); } - float maxVal = v_reduce_max(vmax); + maxVal = v_reduce_max(vmax); +#endif + for (; cnDim < axisStep; cnDim++) { + maxVal = std::max(maxVal, axisBuf[cnDim]); + } + float s = 0.f; + cnDim = 0; +#if (CV_SIMD || CV_SIMD_SCALABLE) // calculate the exp value along the axis v_float32 vs = vx_setzero_f32(); vmax = vx_setall_f32(maxVal); - v_float32 val; // calculate and sum all data along axis - for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) { - val = vx_load(axisBuf + cnDim); + for (; cnDim <= axisStep - nlanes; cnDim += nlanes) { + // cannot apply halide trick here due to axisBuf is constantly updated + v_float32 val = vx_load(axisBuf + cnDim); val = v_sub(val, vmax); val = v_exp(val); - vs = v_add(vs, val); v_store(axisBuf + cnDim, val); } - s = v_reduce_sum(vs); - // subtract the value of the redundant dimension - if (redundantDim != nlanes) { - float _val[VTraits::max_nlanes]; - v_store(_val, val); - for (size_t j = nlanes - redundantDim; j < nlanes; j++) - s -= _val[j]; - } -#else - float maxVal = axisBuf[0]; - for (size_t cnDim = 1; cnDim < axisStep; cnDim++) { - maxVal = std::max(maxVal, axisBuf[cnDim]); - } - for (size_t j = 0; j < axisStep; j++) { - axisBuf[j] = expf(axisBuf[j] - maxVal); - s += axisBuf[j]; - } #endif + for (; cnDim < axisStep; cnDim++) { + axisBuf[cnDim] = expf(axisBuf[cnDim] - maxVal); + s += axisBuf[cnDim]; + } + s = 1.f / s; // copy back the result to src - for (size_t cnDim = 0; cnDim < axisStep; cnDim++) - dstPtr[srcOffset + (cnDim + axisBias) * cnStep] = axisBuf[cnDim] * s; + for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++) + dstPtr[srcOffset + (_cnDim + axisBias) * cnStep] = axisBuf[_cnDim] * s; } }, nstripes); } diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 6ebf7e7867..1830d6f7b0 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -224,8 +224,8 @@ public: { const Mat &src = inputs[i]; Mat &dst = outputs[i]; - CV_Assert(src.size == dst.size && src.type() == dst.type() && - src.isContinuous() && 
dst.isContinuous() && src.type() == CV_32F); + CV_Assert_N(src.size == dst.size, src.type() == dst.type(), + src.isContinuous(), dst.isContinuous(), src.type() == CV_32F); const int nstripes = getNumThreads(); PBody body(func, src, dst, nstripes); diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index e42242a9cf..51a94de3cc 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -105,7 +105,7 @@ public: if (params.has("operation")) { String operation = toLowerCase(params.get("operation")); - if (operation == "prod") + if (operation == "prod" || operation == "mul") op = PROD; else if (operation == "sum") op = SUM; diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 7448511816..ad45a8a2a9 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -405,7 +405,7 @@ public: //swap axis 0 and 1 input x cv::Mat tmp; // Since python input is 4 dimentional and C++ input 3 dimentinal - // we need to proccess each differently + // we need to process each differently if (input[0].dims == 4){ // here !!! CV_Assert(input[0].size[3] == 1); diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index e34cffb936..db6e4b2469 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -228,7 +228,7 @@ public: { #ifdef HAVE_INF_ENGINE if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) - return sliceRanges.size() == 1 && !hasSteps && neg_step_dims.empty(); + return sliceRanges.size() == 1 && neg_step_dims.empty(); #endif #ifdef HAVE_CUDA if (backendId == DNN_BACKEND_CUDA) @@ -792,19 +792,23 @@ public: auto& ieInpNode = nodes[0].dynamicCast()->node; CV_Assert(finalSliceRanges[0].size() == ieInpNode.get_shape().size()); - std::vector offsets, dims; + std::vector offsets, dims, steps; for (int i = 0; i < finalSliceRanges[0].size(); ++i) { offsets.push_back(finalSliceRanges[0][i].start); dims.push_back(finalSliceRanges[0][i].end); } + if (hasSteps) + steps = std::vector(sliceSteps[0].begin(), sliceSteps[0].end()); + else + steps = std::vector((int64_t)dims.size(), 1); auto lower_bounds = std::make_shared(ov::element::i64, ov::Shape{offsets.size()}, offsets.data()); auto upper_bounds = std::make_shared(ov::element::i64, ov::Shape{dims.size()}, dims.data()); auto strides = std::make_shared(ov::element::i64, - ov::Shape{dims.size()}, std::vector((int64_t)dims.size(), 1)); + ov::Shape{dims.size()}, steps); auto slice = std::make_shared(ieInpNode, lower_bounds, upper_bounds, strides, std::vector{}, std::vector{}); diff --git a/modules/dnn/src/legacy_backend.hpp b/modules/dnn/src/legacy_backend.hpp index afa94d76aa..d8ad88cb72 100644 --- a/modules/dnn/src/legacy_backend.hpp +++ b/modules/dnn/src/legacy_backend.hpp @@ -213,6 +213,7 @@ public: { reuse(bestBlobPin, lp); dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape); + dst.dims = shape.size(); return; } } diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 2c2f8ee78c..a4b9a1d04a 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -1142,6 +1142,8 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const std::vector& outBlobNames) { CV_Assert(!empty()); + if (outBlobNames.empty()) + CV_Error(Error::StsBadArg, "in Net::forward(), outBlobNames cannot be empty"); FPDenormalsIgnoreHintScope 
fp_denormals_ignore_scope; if (mainGraph) { @@ -1176,6 +1178,8 @@ void Net::Impl::forward(std::vector>& outputBlobs, const std::vector& outBlobNames) { CV_Assert(!empty()); + if (outBlobNames.empty()) + CV_Error(Error::StsBadArg, "in Net::forward(), outBlobNames cannot be empty"); FPDenormalsIgnoreHintScope fp_denormals_ignore_scope; if (mainGraph) @@ -1467,6 +1471,9 @@ void Net::Impl::updateLayersShapes() LayerPin Net::Impl::getLatestLayerPin(const std::vector& pins) const { + if (pins.empty()) + CV_Error(Error::StsBadArg, + "Cannot Net::Impl::getLatestLayerPin() from empty vector of pins"); return *std::max_element(pins.begin(), pins.end()); } diff --git a/modules/dnn/src/net_impl_fuse.cpp b/modules/dnn/src/net_impl_fuse.cpp index 57afa92315..19767d3185 100644 --- a/modules/dnn/src/net_impl_fuse.cpp +++ b/modules/dnn/src/net_impl_fuse.cpp @@ -370,6 +370,7 @@ void Net::Impl::fuseLayers(const std::vector& blobsToKeep_) Ptr nextNaryEltwiseLayer = nextData->layerInstance.dynamicCast(); if (nextEltwiseLayer.empty() && nextNaryEltwiseLayer.empty()) break; + LayerData *naryOrEltwiseData = nextData; // TODO: fused the Conv+NaryEltwise on OpenCL backend. At present, we can only support it at CUDA backend. if (IS_DNN_OPENCL_TARGET(preferableTarget) && nextNaryEltwiseLayer) @@ -605,8 +606,17 @@ void Net::Impl::fuseLayers(const std::vector& blobsToKeep_) else if (fuse_eltwise) // conv + eltwise/naryEltwise (note: conv could have fused activations before eltwise) { CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget)); - CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1); - ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]); + CV_Assert_N(biasLayerData->outputBlobsWrappers.size() >= 1, ld.inputBlobsWrappers.size() == 1); + // Iterate over eltwise inputs to find exact output id + for (const auto& pin : naryOrEltwiseData->inputBlobsId) + { + if (pin.lid == biasLayerData->id) + { + ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[pin.oid]); + break; + } + } + CV_Assert(ld.inputBlobsWrappers.size() == 2); // Check input was found if (nextEltwiseLayer) printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str())); diff --git a/modules/dnn/src/net_openvino.cpp b/modules/dnn/src/net_openvino.cpp index 9b18308c96..5b99d94b38 100644 --- a/modules/dnn/src/net_openvino.cpp +++ b/modules/dnn/src/net_openvino.cpp @@ -125,7 +125,8 @@ public: preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD || preferableTarget == DNN_TARGET_HDDL || - preferableTarget == DNN_TARGET_FPGA, + preferableTarget == DNN_TARGET_FPGA || + preferableTarget == DNN_TARGET_NPU, "Unknown OpenVINO target" ); } diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 8ff1ea6511..a13b7a2e74 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -279,6 +279,8 @@ bool checkTarget(Target target) return true; else if (std::string::npos != i->find("GPU") && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) return true; + else if (std::string::npos != i->find("NPU") && target == DNN_TARGET_NPU) + return true; } return false; } diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 6d609171b7..166d6b841e 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -1742,9 +1742,6 @@ void TFImporter::parseStridedSlice(tensorflow::GraphDef& net, const tensorflow:: { if 
(end_mask & (1 << i)) ends.at(i) = INT_MAX; - if (strides.at(i) != 1) - CV_Error(Error::StsNotImplemented, - format("StridedSlice with stride %d", strides.at(i))); } if (begins.total() == 4 && getDataLayout(name, data_layouts) == DNN_LAYOUT_NHWC) { @@ -1753,9 +1750,12 @@ void TFImporter::parseStridedSlice(tensorflow::GraphDef& net, const tensorflow:: std::swap(begins.at(1), begins.at(2)); std::swap(ends.at(2), ends.at(3)); std::swap(ends.at(1), ends.at(2)); + std::swap(strides.at(2), strides.at(3)); + std::swap(strides.at(1), strides.at(2)); } layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total())); layerParams.set("end", DictValue::arrayInt((int*)ends.data, ends.total())); + layerParams.set("steps", DictValue::arrayInt((int*)strides.data, strides.total())); Pin inp = parsePin(layer.input(0)); if (value_id.find(inp.name) != value_id.end()) @@ -2571,7 +2571,7 @@ void TFImporter::parsePReLU(tensorflow::GraphDef& net, const tensorflow::NodeDef layerParams.blobs.resize(1); if (scales.dims == 3) { - // Considering scales from Keras wih HWC layout; + // Considering scales from Keras with HWC layout; transposeND(scales, {2, 0, 1}, layerParams.blobs[0]); } else { layerParams.blobs[0] = scales; diff --git a/modules/dnn/src/tflite/tflite_importer.cpp b/modules/dnn/src/tflite/tflite_importer.cpp index 6e5a90b795..e44dd66927 100644 --- a/modules/dnn/src/tflite/tflite_importer.cpp +++ b/modules/dnn/src/tflite/tflite_importer.cpp @@ -71,11 +71,12 @@ private: void parseDetectionPostProcess(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseActivation(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseSplit(const Operator& op, const std::string& opcode, LayerParams& layerParams); + void parseStridedSlice(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseFullyConnected(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseSoftmax(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseCast(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseTranspose(const Operator& op, const std::string& opcode, LayerParams& layerParams); - void parseGlobalPooling(const Operator& op, const std::string& opcode, LayerParams& layerParams); + void parseReduce(const Operator& op, const std::string& opcode, LayerParams& layerParams); void parseFusedActivation(const Operator& op, ActivationFunctionType activ); void parseActivation(const Operator& op, const std::string& opcode, LayerParams& layerParams, bool isFused); @@ -85,6 +86,7 @@ private: int addReshapeLayer(const std::vector& shape, int axis, int num_axes, const std::string& name, const std::pair& inpId, int dtype, int inpTensorId); int addFlattenLayer(int axis, int end_axis, const std::string& name, const std::pair& inpId, int dtype, int outTensorId); + int addConstLayer(const Mat& data, const std::string& name); inline bool isInt8(const Operator& op); inline void getQuantParams(const Operator& op, float& inpScale, int& inpZero, float& outScale, int& outZero); @@ -92,9 +94,12 @@ private: Mat TFLiteImporter::parseTensor(const Tensor& tensor) { + std::vector shape; const auto tensor_shape = tensor.shape(); - CV_Assert(tensor_shape); - std::vector shape(tensor_shape->begin(), tensor_shape->end()); + if (tensor_shape && tensor_shape->size()) + shape.assign(tensor_shape->begin(), tensor_shape->end()); + else + shape.resize(1, 1); int bufferIdx = tensor.buffer(); 
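Illustrative sketch (not part of the patch): the StridedSlice importer hunks above reorder the 4-element begin/end/stride parameters from TensorFlow's NHWC order to OpenCV's NCHW order with two adjacent swaps. The standalone C++ below, using hypothetical values, checks that swapping indices (2,3) and then (1,2) gives the same result as the {0, 3, 1, 2} gather order used elsewhere in the importers.

#include <array>
#include <cassert>
#include <utility>

// Reorder a per-axis parameter vector from NHWC to NCHW, mirroring the
// pair of std::swap calls in the StridedSlice parsing code.
static std::array<int, 4> nhwcToNchw(std::array<int, 4> v)
{
    std::swap(v[2], v[3]); // N,H,W,C -> N,H,C,W
    std::swap(v[1], v[2]); // N,H,C,W -> N,C,H,W
    return v;
}

int main()
{
    const std::array<int, 4> beginsNHWC = {0, 10, 20, 3}; // hypothetical begins
    // Same values gathered with order {0, 3, 1, 2}, i.e. NCHW layout.
    const std::array<int, 4> expected   = {0, 3, 10, 20};
    assert(nhwcToNchw(beginsNHWC) == expected);
    return 0;
}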
CV_Assert(bufferIdx != 0); // 0th buffer is a no-data buffer const Buffer* buffer = model->buffers()->Get(bufferIdx); @@ -122,7 +127,11 @@ Mat TFLiteImporter::parseTensor(const Tensor& tensor) default: CV_Error(Error::StsNotImplemented, format("Parse tensor with type %s", EnumNameTensorType(tensor.type()))); } - return shape.empty() ? Mat() : Mat(shape, dtype, const_cast(data)); + Mat res = Mat(shape, dtype, const_cast(data)); + // workaround for scalars support + if (!tensor_shape || shape.size() == 1) + res.dims = 1; + return res; } TFLiteImporter::TFLiteImporter(Net& dstNet, const char* modelBuffer, size_t bufSize, bool newEngine) @@ -277,6 +286,8 @@ void TFLiteImporter::populateNet() // Dequantize a buffer Mat dataFP32; data.convertTo(dataFP32, CV_32F); + // workaround for scalars support + dataFP32.dims = data.dims; allTensors[op_outputs->Get(0)] = dataFP32; continue; } @@ -321,7 +332,9 @@ TFLiteImporter::DispatchMap TFLiteImporter::buildDispatchMap() dispatch["CONV_2D"] = &TFLiteImporter::parseConvolution; dispatch["DEPTHWISE_CONV_2D"] = &TFLiteImporter::parseDWConvolution; - dispatch["ADD"] = dispatch["MUL"] = &TFLiteImporter::parseEltwise; + dispatch["ADD"] = dispatch["MUL"] = dispatch["SUB"] = + dispatch["SQRT"] = dispatch["DIV"] = dispatch["NEG"] = + dispatch["RSQRT"] = dispatch["SQUARED_DIFFERENCE"] = &TFLiteImporter::parseEltwise; dispatch["RELU"] = dispatch["PRELU"] = dispatch["HARD_SWISH"] = dispatch["LOGISTIC"] = dispatch["LEAKY_RELU"] = &TFLiteImporter::parseActivation; dispatch["MAX_POOL_2D"] = dispatch["AVERAGE_POOL_2D"] = &TFLiteImporter::parsePooling; @@ -341,7 +354,8 @@ TFLiteImporter::DispatchMap TFLiteImporter::buildDispatchMap() dispatch["CAST"] = &TFLiteImporter::parseCast; dispatch["TFLite_Detection_PostProcess"] = &TFLiteImporter::parseDetectionPostProcess; dispatch["TRANSPOSE"] = &TFLiteImporter::parseTranspose; - dispatch["MEAN"] = dispatch["REDUCE_MAX"] = &TFLiteImporter::parseGlobalPooling; + dispatch["STRIDED_SLICE"] = &TFLiteImporter::parseStridedSlice; + dispatch["REDUCE_MAX"] = dispatch["MEAN"] = dispatch["SUM"] = &TFLiteImporter::parseReduce; return dispatch; } @@ -473,6 +487,14 @@ void TFLiteImporter::addLayer(LayerParams& layerParams, const std::vectorGet(0); + if (layouts[inpId] == DNN_LAYOUT_UNKNOWN && modelTensors->Get(inpId)->shape()->size() == 4) + { + int permId = addPermuteLayer({0, 3, 1, 2}, layerParams.name + "/permute_input", layerIds[inpId], isInt8(op) ? CV_8S : CV_32F, op.inputs()->Get(0)); // NHWC -> NCHW + layerIds[inpId] = std::make_pair(permId, 0); + layouts[op.outputs()->Get(0)] = DNN_LAYOUT_NHWC; + } + auto options = reinterpret_cast(op.builtin_options()); layerParams.set("pad_mode", EnumNamePadding(options->padding())); layerParams.set("stride_w", options->stride_w()); @@ -622,8 +644,9 @@ void TFLiteImporter::parsePadding(const Operator& op, const std::string& opcode, } void TFLiteImporter::parseEltwise(const Operator& op, const std::string& opcode, LayerParams& layerParams) { + bool isOpInt8 = isInt8(op); ActivationFunctionType activ = ActivationFunctionType_NONE; - layerParams.type = "Eltwise"; + layerParams.type = isOpInt8 ? 
"Eltwise" : "NaryEltwise"; if (opcode == "ADD") { auto options = reinterpret_cast(op.builtin_options()); activ = options->fused_activation_function(); @@ -632,12 +655,35 @@ void TFLiteImporter::parseEltwise(const Operator& op, const std::string& opcode, else if (opcode == "MUL") { auto options = reinterpret_cast(op.builtin_options()); activ = options->fused_activation_function(); - layerParams.set("operation", "prod"); + layerParams.set("operation", "mul"); + } + else if (opcode == "DIV") { + auto options = reinterpret_cast(op.builtin_options()); + activ = options->fused_activation_function(); + layerParams.set("operation", "div"); + } + else if (opcode == "SUB" && !isOpInt8) { + auto options = reinterpret_cast(op.builtin_options()); + activ = options->fused_activation_function(); + layerParams.set("operation", "sub"); + } + else if (opcode == "NEG") { + layerParams.type = "Scale"; + layerParams.blobs.resize(1, Mat(1, 1, CV_32F, Scalar(-1))); + } + else if (opcode == "SQUARED_DIFFERENCE" && !isOpInt8) { + layerParams.set("operation", "sub"); + } + else if (opcode == "RSQRT" && !isOpInt8) { + layerParams.type = "Sqrt"; + } + else if (opcode == "SQRT" && !isOpInt8) { + layerParams.type = "Sqrt"; } else { - CV_Error(Error::StsNotImplemented, "Unknown opcode for Eltwise layer: " + opcode); + CV_Error(Error::StsNotImplemented, cv::format("DNN/TFLite: Unknown opcode for %s Eltwise layer '%s'", isOpInt8 ? "INT8" : "FP32", opcode.c_str())); } - if (isInt8(op)) { + if (isOpInt8) { const Tensor* out = modelTensors->Get(op.outputs()->Get(0)); float outScale = out->quantization()->scale()->Get(0); int outZero = out->quantization()->zero_point()->Get(0); @@ -665,10 +711,37 @@ void TFLiteImporter::parseEltwise(const Operator& op, const std::string& opcode, layerParams.set("zeropoints", outZero); } + // Force all inputs to be in graph, not as blobs + for (int idx : *op.inputs()) { + if (layerIds.find(idx) != layerIds.end()) { + continue; // Output from a different layer + } + Mat blob = allTensors[idx]; + if (layouts[op.inputs()->Get(0)] == DNN_LAYOUT_NHWC && blob.dims == 1) { + blob = blob.reshape(1, {1, (int)blob.total(), 1, 1}); + } + int constId = addConstLayer(blob, modelTensors->Get(idx)->name()->str()); + layerIds[idx] = std::make_pair(constId, 0); + } + + std::string fusedActivationType = EnumNameActivationFunctionType(activ); bool haveFusedActivation = fusedActivationType != "NONE"; addLayer(layerParams, op, false, haveFusedActivation); parseFusedActivation(op, activ); + + // Layers that split on multiple operations + if (opcode == "SQUARED_DIFFERENCE") { + LayerParams lp; + lp.set("power", 2); + int id = dstNet.addLayerToPrev(layerParams.name + "/square", "Power", isOpInt8 ? CV_8S : CV_32F, lp); + layerIds[op.outputs()->Get(0)] = std::make_pair(id, 0); + } + else if (opcode == "RSQRT") { + LayerParams lp; + int id = dstNet.addLayerToPrev(layerParams.name + "/inv", "Reciprocal", isOpInt8 ? CV_8S : CV_32F, lp); + layerIds[op.outputs()->Get(0)] = std::make_pair(id, 0); + } } void TFLiteImporter::parsePooling(const Operator& op, const std::string& opcode, LayerParams& layerParams) { @@ -767,15 +840,36 @@ void TFLiteImporter::parseConcat(const Operator& op, const std::string& opcode, auto options = reinterpret_cast(op.builtin_options()); int axis = options->axis(); - DataLayout inpLayout = layouts[op.inputs()->Get(0)]; - if (inpLayout == DNN_LAYOUT_NHWC) { - // OpenCV works in NCHW data layout. So change the axis correspondingly. 
- axis = normalize_axis(axis, 4); - static const int remap[] = {0, 2, 3, 1}; - axis = remap[axis]; + bool hasNHWCInput = false; + for (int idx : *op.inputs()) { + DataLayout inpLayout = layouts[idx]; + if (inpLayout == DNN_LAYOUT_NHWC) { + // OpenCV works in NCHW data layout. So change the axis correspondingly. + axis = normalize_axis(axis, 4); + static const int remap[] = {0, 2, 3, 1}; + axis = remap[axis]; + hasNHWCInput = true; + break; + } } layerParams.set("axis", axis); + // Force all inputs to be in graph, not as blobs + for (int idx : *op.inputs()) { + if (layerIds.find(idx) != layerIds.end()) { + continue; // Output from a different layer + } + Mat blob = allTensors[idx]; + if (hasNHWCInput && blob.dims == 4) + { + Mat nchwBlob; + transposeND(blob, {0, 3, 1, 2}, nchwBlob); + blob = nchwBlob; + } + int constId = addConstLayer(blob, modelTensors->Get(idx)->name()->str()); + layerIds[idx] = std::make_pair(constId, 0); + } + std::string fusedActivationType = EnumNameActivationFunctionType(options->fused_activation_function()); bool haveFusedActivation = fusedActivationType != "NONE"; addLayer(layerParams, op, false, haveFusedActivation); @@ -886,35 +980,38 @@ void TFLiteImporter::parseTranspose(const Operator& op, const std::string& opcod addLayer(layerParams, op); } -void TFLiteImporter::parseGlobalPooling(const Operator& op, const std::string& opcode, LayerParams& layerParams) +void TFLiteImporter::parseReduce(const Operator& op, const std::string& opcode, LayerParams& layerParams) { - layerParams.type = "Pooling"; - if(opcode == "MEAN") { - layerParams.set("pool", "ave"); + layerParams.type = "Reduce"; + if (opcode == "REDUCE_MAX") { + layerParams.set("reduce", "max"); } - else if (opcode == "REDUCE_MAX") { - layerParams.set("pool", "max"); + else if (opcode == "SUM") { + layerParams.set("reduce", "sum"); + } + else if (opcode == "MEAN") { + layerParams.set("reduce", "mean"); } else { - CV_Error(Error::StsNotImplemented, "Unsupported pooling " + opcode); + CV_Error(Error::StsNotImplemented, "Unsupported reducing " + opcode); } - layerParams.set("global_pooling", true); auto options = op.builtin_options_as_ReducerOptions(); - bool keep_dims = options->keep_dims(); + layerParams.set("keepdims", options->keep_dims()); - if (!keep_dims) { - const auto name = layerParams.name; - layerParams.name += "/global_pooling"; - addLayer(layerParams, op, false, true); + Mat axes = allTensors[op.inputs()->Get(1)].clone(); + CV_CheckTypeEQ(axes.type(), CV_32S, ""); - int out = op.outputs()->Get(0); - auto outId = layerIds[out]; - int flattenId = addFlattenLayer(1, -1, name, outId, isInt8(op) ? CV_8S : CV_32F, out); - layerIds[out] = std::make_pair(flattenId, 0); - } - else { - addLayer(layerParams, op); + DataLayout inpLayout = layouts[op.inputs()->Get(0)]; + if (inpLayout == DNN_LAYOUT_NHWC) { + static const int remap[] = {0, 2, 3, 1}; + // OpenCV works in NCHW data layout. So change the axis correspondingly. + for (int i = 0; i < axes.total(); ++i) { + axes.at(i) = remap[normalize_axis(axes.at(i), 4)]; + } } + + layerParams.set("axes", DictValue::arrayInt(axes.ptr(), axes.total())); + addLayer(layerParams, op); } int TFLiteImporter::addPermuteLayer(const std::vector& order, const std::string& permName, @@ -982,6 +1079,13 @@ int TFLiteImporter::addFlattenLayer(int axis, int end_axis, const std::string& n } } +int TFLiteImporter::addConstLayer(const Mat& blob, const std::string& name) +{ + LayerParams lp; + lp.blobs.push_back(blob.u ? 
blob : blob.clone()); // some tensors are owned by OpenCV + return dstNet.addLayer(name, "Const", lp); +} + void TFLiteImporter::parseDeconvolution(const Operator& op, const std::string& opcode, LayerParams& layerParams) { layerParams.type = "Deconvolution"; @@ -1070,6 +1174,68 @@ void TFLiteImporter::parseSplit(const Operator& op, const std::string& opcode, L addLayer(layerParams, op); } +void TFLiteImporter::parseStridedSlice(const Operator& op, const std::string& opcode, LayerParams& layerParams) { + layerParams.type = "Slice"; + auto options = op.builtin_options_as_StridedSliceOptions(); + CV_Assert(options); + int endMask = options->end_mask(); + if (options->new_axis_mask()) + CV_Error(Error::StsNotImplemented, "New axis during StridedSlice"); + int shrinkMask = options->shrink_axis_mask(); + + Mat begins = allTensors[op.inputs()->Get(1)]; + Mat ends = allTensors[op.inputs()->Get(2)]; + Mat strides = allTensors[op.inputs()->Get(3)]; + + CV_CheckTypeEQ(begins.type(), CV_32SC1, ""); + CV_CheckTypeEQ(ends.type(), CV_32SC1, ""); + CV_CheckTypeEQ(strides.type(), CV_32SC1, ""); + const int num = begins.total(); + CV_Assert_N(num == ends.total(), num == strides.total()); + for (int i = 0; i < num; ++i) + { + if (endMask & (1 << i)) + ends.at(i) = INT_MAX; + } + if (begins.total() == 4 && layouts[op.inputs()->Get(0)] == DNN_LAYOUT_NHWC) + { + // Swap NHWC parameters' order to NCHW. + std::swap(begins.at(2), begins.at(3)); + std::swap(begins.at(1), begins.at(2)); + std::swap(ends.at(2), ends.at(3)); + std::swap(ends.at(1), ends.at(2)); + std::swap(strides.at(2), strides.at(3)); + std::swap(strides.at(1), strides.at(2)); + } + layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total())); + layerParams.set("end", DictValue::arrayInt((int*)ends.data, ends.total())); + layerParams.set("steps", DictValue::arrayInt((int*)strides.data, strides.total())); + + int lastShrinkAxis = -1; + for (int axis = 0; axis < num; ++axis) + { + if (shrinkMask & (1 << axis)) + lastShrinkAxis = axis; + } + std::string layerName = layerParams.name; + if (lastShrinkAxis != -1) + { + layerParams.name += "/slice"; + } + + addLayer(layerParams, op); + + for (int axis = 0; axis < num; ++axis) + { + if (!(shrinkMask & (1 << axis))) + continue; + std::string name = (axis == lastShrinkAxis) ? layerName : format("%s/shrink_axis_%d", layerName.c_str(), axis); + int layerId = addFlattenLayer(axis, axis + 1, name, + layerIds[op.outputs()->Get(0)], isInt8(op) ? 
CV_8S : CV_32F, op.inputs()->Get(0)); + layerIds[op.inputs()->Get(0)] = std::make_pair(layerId, 0); + } +} + void TFLiteImporter::parseFullyConnected(const Operator& op, const std::string& opcode, LayerParams& layerParams) { layerParams.type = "Gemm"; auto options = op.builtin_options_as_FullyConnectedOptions(); diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index ec2e923fb8..e9e93ef93a 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -3295,8 +3295,8 @@ TEST_P(Test_ONNX_nets, ViT_B_32) { } if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) { if (target == DNN_TARGET_CPU) { - l1 = 4.4e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05 - lInf = 0.0002; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001 + l1 = 6e-5; // Expected: (normL1) <= (l1), actual: 4.31208e-05 vs 1e-05 + lInf = 0.0003; // Expected: (normInf) <= (lInf), actual: 0.000194907 vs 0.0001 } else if (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16) { l1 = 0.0092; // Expected: (normL1) <= (l1), actual: 0.00918349 vs 4.4e-05 lInf = 0.056; // Expected: (normInf) <= (lInf), actual: 0.0556431 vs 0.0002 diff --git a/modules/dnn/test/test_tflite_importer.cpp b/modules/dnn/test/test_tflite_importer.cpp index 91715d4a4d..52e5ecef27 100644 --- a/modules/dnn/test/test_tflite_importer.cpp +++ b/modules/dnn/test/test_tflite_importer.cpp @@ -57,6 +57,7 @@ void Test_TFLite::testModel(Net& net, const std::string& modelName, const Mat& i ASSERT_EQ(outs.size(), outNames.size()); for (int i = 0; i < outNames.size(); ++i) { + std::replace(outNames[i].begin(), outNames[i].end(), ':', '_'); Mat ref = blobFromNPY(findDataFile(format("dnn/tflite/%s_out_%s.npy", modelName.c_str(), outNames[i].c_str()))); // A workaround solution for the following cases due to inconsistent shape definitions. 
// The details please see: https://github.com/opencv/opencv/pull/25297#issuecomment-2039081369 @@ -278,6 +279,16 @@ TEST_P(Test_TFLite, leakyRelu) { testLayer("leakyRelu"); } +TEST_P(Test_TFLite, StridedSlice) { + testLayer("strided_slice"); +} + +TEST_P(Test_TFLite, DISABLED_face_blendshapes) +{ + Mat inp = blobFromNPY(findDataFile("dnn/tflite/face_blendshapes_inp.npy")); + testModel("face_blendshapes", inp); +} + INSTANTIATE_TEST_CASE_P(/**/, Test_TFLite, dnnBackendsAndTargets()); }} // namespace diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp index dc38882660..269947b525 100644 --- a/modules/highgui/src/window_QT.cpp +++ b/modules/highgui/src/window_QT.cpp @@ -1649,16 +1649,18 @@ CvWindow::CvWindow(QString name, int arg2) createStatusBar(); } + myView->getWidget()->setSizePolicy(QSizePolicy::Expanding, QSizePolicy::Expanding); + //Now attach everything if (myToolBar) - myGlobalLayout->addWidget(myToolBar, 0, Qt::AlignLeft); + myGlobalLayout->addWidget(myToolBar); - myGlobalLayout->addWidget(myView->getWidget(), 0, Qt::AlignCenter); + myGlobalLayout->addWidget(myView->getWidget()); myGlobalLayout->addLayout(myBarLayout); if (myStatusBar) - myGlobalLayout->addWidget(myStatusBar, 0, Qt::AlignLeft); + myGlobalLayout->addWidget(myStatusBar); setLayout(myGlobalLayout); show(); diff --git a/modules/imgcodecs/src/grfmt_gif.cpp b/modules/imgcodecs/src/grfmt_gif.cpp index d4c0099f1f..1954e2d501 100644 --- a/modules/imgcodecs/src/grfmt_gif.cpp +++ b/modules/imgcodecs/src/grfmt_gif.cpp @@ -130,7 +130,7 @@ bool GifDecoder::readData(Mat &img) { globalColorTable[bgColor * 3 + 1], // G globalColorTable[bgColor * 3 + 0], // R 0); // A - restore = Mat(width, height, CV_8UC4, background); + restore = Mat(height, width, CV_8UC4, background); } else { diff --git a/modules/imgcodecs/src/grfmt_jpeg2000_openjpeg.cpp b/modules/imgcodecs/src/grfmt_jpeg2000_openjpeg.cpp index c05c0bca0e..70832277ef 100644 --- a/modules/imgcodecs/src/grfmt_jpeg2000_openjpeg.cpp +++ b/modules/imgcodecs/src/grfmt_jpeg2000_openjpeg.cpp @@ -146,7 +146,7 @@ public: return (ptr_ - other.ptr_) / step_; } - /* Comparision */ + /* Comparison */ bool operator==(const ChannelsIterator& other) const CV_NOEXCEPT { return ptr_ == other.ptr_; diff --git a/modules/imgcodecs/src/grfmt_png.cpp b/modules/imgcodecs/src/grfmt_png.cpp index 7febc3153b..c4b5a2c3a6 100644 --- a/modules/imgcodecs/src/grfmt_png.cpp +++ b/modules/imgcodecs/src/grfmt_png.cpp @@ -133,6 +133,7 @@ const uint32_t id_bKGD = 0x624B4744; // The bKGD chunk specifies a default backg const uint32_t id_tRNS = 0x74524E53; // The tRNS chunk provides transparency information const uint32_t id_tEXt = 0x74455874; // The tEXt chunk stores metadata as text in key-value pairs const uint32_t id_IEND = 0x49454E44; // end/footer chunk +const uint32_t id_CgBI = 0x43674249; // The CgBI chunk (Apple private) is not supported. APNGFrame::APNGFrame() { @@ -285,9 +286,18 @@ bool PngDecoder::readHeader() if (!readFromStreamOrBuffer(&sig, 8)) return false; + // IHDR chunk shall be first. ( https://www.w3.org/TR/png-3/#5ChunkOrdering ) id = read_chunk(m_chunkIHDR); - if (id != id_IHDR) + if (id == id_CgBI) + { + CV_LOG_ERROR(NULL, "CgBI chunk (Apple private) found as the first chunk. IHDR is expected."); return false; + } + if (id != id_IHDR) + { + CV_LOG_ERROR(NULL, "IHDR chunk shall be first. 
This data may be broken or malformed."); + return false; + } m_is_fcTL_loaded = false; while (true) diff --git a/modules/imgcodecs/test/test_png.cpp b/modules/imgcodecs/test/test_png.cpp index 95b0bc0793..f271950a5b 100644 --- a/modules/imgcodecs/test/test_png.cpp +++ b/modules/imgcodecs/test/test_png.cpp @@ -110,6 +110,44 @@ TEST(Imgcodecs_Png, read_color_palette_with_alpha) EXPECT_EQ(img.at(0, 1), Vec3b(255, 0, 0)); } +// IHDR shall be first. +// See https://github.com/opencv/opencv/issues/27295 +TEST(Imgcodecs_Png, decode_regression27295) +{ + vector buff; + Mat src = Mat::zeros(240, 180, CV_8UC3); + vector param; + EXPECT_NO_THROW(imencode(".png", src, buff, param)); + + Mat img; + + // If IHDR chunk found as the first chunk, output shall not be empty. + // 8 means PNG signature length. + // 4 means length field(uint32_t). + EXPECT_EQ(buff[8+4+0], 'I'); + EXPECT_EQ(buff[8+4+1], 'H'); + EXPECT_EQ(buff[8+4+2], 'D'); + EXPECT_EQ(buff[8+4+3], 'R'); + EXPECT_NO_THROW(img = imdecode(buff, IMREAD_COLOR)); + EXPECT_FALSE(img.empty()); + + // If Non-IHDR chunk found as the first chunk, output shall be empty. + buff[8+4+0] = 'i'; // Not 'I' + buff[8+4+1] = 'H'; + buff[8+4+2] = 'D'; + buff[8+4+3] = 'R'; + EXPECT_NO_THROW(img = imdecode(buff, IMREAD_COLOR)); + EXPECT_TRUE(img.empty()); + + // If CgBI chunk (Apple private) found as the first chunk, output shall be empty with special message. + buff[8+4+0] = 'C'; + buff[8+4+1] = 'g'; + buff[8+4+2] = 'B'; + buff[8+4+3] = 'I'; + EXPECT_NO_THROW(img = imdecode(buff, IMREAD_COLOR)); + EXPECT_TRUE(img.empty()); +} + typedef testing::TestWithParam Imgcodecs_Png_PngSuite; TEST_P(Imgcodecs_Png_PngSuite, decode) diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 09d04832e4..ceca2be80e 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -6,7 +6,7 @@ ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) -ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2 AVX512_SKX) ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(sumpixels SSE2 AVX2 AVX512_SKX) diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index 1336302613..59420ede73 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -2825,7 +2825,7 @@ void accW_simd_(const ushort* src, float* dst, const uchar* mask, int len, int c v_expand(v_src1, v_src10, v_src11); v_expand(v_src2, v_src20, v_src21); - v_float32 v_dst00, v_dst01, v_dst02, v_dst10, v_dst11, v_dst20, v_dst21; + v_float32 v_dst00, v_dst01, v_dst10, v_dst11, v_dst20, v_dst21; v_load_deinterleave(dst + x * cn , v_dst00, v_dst10, v_dst20); v_load_deinterleave(dst + (x + step) * cn, v_dst01, v_dst11, v_dst21); diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp index 4ccec12496..7b992303b6 100644 --- a/modules/imgproc/src/bilateral_filter.dispatch.cpp +++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp @@ -75,10 +75,12 @@ static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, if (depth != CV_8U || cn > 4) return false; - if (sigma_color <= 0) - sigma_color = 1; - if (sigma_space <= 0) - sigma_space = 1; + constexpr double eps = 1e-6; + if( sigma_color <= eps 
|| sigma_space <= eps ) + { + _src.copyTo(_dst); + return true; + } double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); @@ -165,10 +167,12 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d, CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; + constexpr double eps = 1e-6; + if( sigma_color <= eps || sigma_space <= eps ) + { + src.copyTo(dst); + return; + } double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); @@ -232,10 +236,12 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d, CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; + constexpr double eps = 1e-6; + if( sigma_color <= eps || sigma_space <= eps ) + { + src.copyTo(dst); + return; + } double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); @@ -358,9 +364,16 @@ static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, do #ifdef HAVE_IPP_IW CV_INSTRUMENT_REGION_IPP(); + constexpr double eps = 1e-6; + if( sigmaColor <= eps || sigmaSpace <= eps ) + { + src.copyTo(dst); + return true; + } + int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); - Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); - Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); + Ipp32f valSquareSigma = (Ipp32f)(sigmaColor*sigmaColor); + Ipp32f posSquareSigma = (Ipp32f)(sigmaSpace*sigmaSpace); // Acquire data and begin processing try diff --git a/modules/imgproc/src/color_hsv.dispatch.cpp b/modules/imgproc/src/color_hsv.dispatch.cpp index 2d3dbf74bd..db7aa6fe84 100644 --- a/modules/imgproc/src/color_hsv.dispatch.cpp +++ b/modules/imgproc/src/color_hsv.dispatch.cpp @@ -257,6 +257,41 @@ bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full return h.run(); } +static UMat init_sdiv_table() +{ + cv::Mat sdiv_mat(1, 256, CV_32SC1); + int* sdiv = sdiv_mat.ptr(); + + const int hsv_shift = 12; + const int v = 255 << hsv_shift; + + sdiv[0] = 0; + for(int i = 1; i < 256; i++ ) + sdiv[i] = saturate_cast(v/(1.*i)); + + cv::UMat result; + sdiv_mat.copyTo(result); + return result; +} + +static UMat init_hdiv_table(int hrange) +{ + cv::Mat hdiv_mat(1, 256, CV_32SC1); + int* hdiv = hdiv_mat.ptr(); + + const int hsv_shift = 12; + const int v = hrange << hsv_shift; + + hdiv[0] = 0; + for (int i = 1; i < 256; i++ ) + hdiv[i] = saturate_cast(v/(6.*i)); + + cv::UMat result; + hdiv_mat.copyTo(result); + return result; + +} + bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full ) { OclHelper< Set<3, 4>, Set<3>, Set > h(_src, _dst, 3); @@ -274,41 +309,22 @@ bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full if(_src.depth() == CV_8U) { - static UMat sdiv_data; - static UMat hdiv_data180; - static UMat hdiv_data256; - static int sdiv_table[256]; - static int hdiv_table180[256]; - static int hdiv_table256[256]; - static volatile bool initialized180 = false, initialized256 = false; - volatile bool & initialized = hrange == 180 ? 
initialized180 : initialized256; + static UMat sdiv_data = init_sdiv_table(); + UMat hdiv_data; - if (!initialized) + if (hrange == 180) { - int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12; - UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256; - - sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0; - - int v = 255 << hsv_shift; - if (!initialized180 && !initialized256) - { - for(int i = 1; i < 256; i++ ) - sdiv_table[i] = saturate_cast(v/(1.*i)); - Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data); - } - - v = hrange << hsv_shift; - for (int i = 1; i < 256; i++ ) - hdiv_table[i] = saturate_cast(v/(6.*i)); - - Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data); - initialized = true; + static UMat hdiv_data180 = init_hdiv_table(180); + hdiv_data = hdiv_data180; + } + else + { + static UMat hdiv_data256 = init_hdiv_table(256); + hdiv_data = hdiv_data256; } h.setArg(ocl::KernelArg::PtrReadOnly(sdiv_data)); - h.setArg(hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) : - ocl::KernelArg::PtrReadOnly(hdiv_data180)); + h.setArg(ocl::KernelArg::PtrReadOnly(hdiv_data)); } return h.run(); diff --git a/modules/imgproc/src/color_hsv.simd.hpp b/modules/imgproc/src/color_hsv.simd.hpp index c450d609e5..8ae663dff4 100644 --- a/modules/imgproc/src/color_hsv.simd.hpp +++ b/modules/imgproc/src/color_hsv.simd.hpp @@ -850,7 +850,7 @@ struct RGB2HLS_b for ( ; j <= dn*bufChannels - nBlock*bufChannels; j += nBlock*bufChannels, src += nBlock*4) { - v_uint8 rgb0, rgb1, rgb2, rgb3, dummy; + v_uint8 rgb0, rgb1, rgb2, dummy; v_load_deinterleave(src, rgb0, rgb1, rgb2, dummy); v_uint16 d0,d1,d2,d3,d4,d5; diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp index 645e9557ed..8bc20fbefd 100644 --- a/modules/imgproc/src/hal_replacement.hpp +++ b/modules/imgproc/src/hal_replacement.hpp @@ -1395,6 +1395,26 @@ inline int hal_ni_polygonMoments(const uchar* src_data, size_t src_size, int src #define cv_hal_polygonMoments hal_ni_polygonMoments //! @endcond +/** + @brief Calculates a histogram of a set of arrays + @param src_data Source imgage data + @param src_step Source image step + @param src_type Source image type + @param src_width Source image width + @param src_height Source image height + @param hist_data Histogram data + @param hist_size Histogram size + @param ranges Array of dims arrays of the histogram bin boundaries + @param uniform Flag indicating whether the histogram is uniform or not + @param accumulate Accumulation flag +*/ +inline int hal_ni_calcHist(const uchar* src_data, size_t src_step, int src_type, int src_width, int src_height, float* hist_data, int hist_size, const float** ranges, bool uniform, bool accumulate) +{ return CV_HAL_ERROR_NOT_IMPLEMENTED; } + +//! @cond IGNORED +#define cv_hal_calcHist hal_ni_calcHist +//! @endcond + //! 
@} #if defined(__clang__) diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 62a718e827..5ca5d58437 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -910,6 +910,11 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels, && _mask.empty() && images[0].dims <= 2 && ranges && ranges[0], ipp_calchist(images[0], hist, histSize[0], ranges, uniform, accumulate)); + if (nimages == 1 && dims == 1 && channels && channels[0] == 0 && _mask.empty() && images[0].dims <= 2 && ranges && ranges[0]) { + CALL_HAL(calcHist, cv_hal_calcHist, images[0].data, images[0].step, images[0].type(), images[0].cols, images[0].rows, + hist.ptr(), histSize[0], ranges, uniform, accumulate); + } + Mat ihist = hist; ihist.flags = (ihist.flags & ~CV_MAT_TYPE_MASK)|CV_32S; @@ -1986,6 +1991,46 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) if( (method == cv::HISTCMP_CHISQR) || (method == cv::HISTCMP_CHISQR_ALT)) { +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F + v_float64 v_eps = vx_setall_f64(DBL_EPSILON); + v_float64 v_one = vx_setall_f64(1.f); + v_float64 v_zero = vx_setzero_f64(); + v_float64 v_res = vx_setzero_f64(); + for ( ; j <= len - VTraits::vlanes(); j += VTraits::vlanes()) + { + v_float32 v_h1 = vx_load(h1 + j), v_h2 = vx_load(h2 + j); + v_float64 v_h1_l = v_cvt_f64(v_h1), v_h1_h = v_cvt_f64_high(v_h1); + v_float64 v_h2_l = v_cvt_f64(v_h2), v_h2_h = v_cvt_f64_high(v_h2); + + v_float64 v_a_l, v_a_h; + v_a_l = v_sub(v_h1_l, v_h2_l); + v_a_h = v_sub(v_h1_h, v_h2_h); + + v_float64 v_b_l, v_b_h; + if (method == cv::HISTCMP_CHISQR) + { + v_b_l = v_h1_l; + v_b_h = v_h1_h; + } + else + { + v_b_l = v_add(v_h1_l, v_h2_l); + v_b_h = v_add(v_h1_h, v_h2_h); + } + + // low part + auto v_res_l = v_mul(v_mul(v_a_l, v_a_l), v_div(v_one, v_b_l)); + auto mask = v_gt(v_abs(v_b_l), v_eps); + v_res_l = v_select(mask, v_res_l, v_zero); + v_res = v_add(v_res, v_res_l); + // high part + auto v_res_h = v_mul(v_mul(v_a_h, v_a_h), v_div(v_one, v_b_h)); + mask = v_gt(v_abs(v_b_h), v_eps); + v_res_h = v_select(mask, v_res_h, v_zero); + v_res = v_add(v_res, v_res_h); + } + result += v_reduce_sum(v_res); +#endif for( ; j < len; j++ ) { double a = h1[j] - h2[j]; diff --git a/modules/imgproc/src/median_blur.simd.hpp b/modules/imgproc/src/median_blur.simd.hpp index 7cc0aa693c..a3e7a101bb 100644 --- a/modules/imgproc/src/median_blur.simd.hpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -13,6 +13,7 @@ // Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Copyright (C) 2025, Advanced Micro Devices, all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -693,8 +694,16 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) #else int nlanes = 1; #endif - for( ; j <= size.width - nlanes - cn; j += nlanes ) + for (; j < size.width - cn; j += nlanes) { + //handling tail in vectorized path itself + if ( j > size.width - cn - nlanes ) { + if (j == cn || src == dst) { + break; + } + j = size.width - cn - nlanes; + } + VT p0 = vop.load(row0+j-cn), p1 = vop.load(row0+j), p2 = vop.load(row0+j+cn); VT p3 = vop.load(row1+j-cn), p4 = vop.load(row1+j), p5 = vop.load(row1+j+cn); VT p6 = vop.load(row2+j-cn), p7 = vop.load(row2+j), p8 = vop.load(row2+j+cn); @@ -705,6 +714,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7); vop(p4, p2); vop(p6, p4); vop(p4, p2); vop.store(dst+j, p4); + } limit = size.width; @@ -798,8 +808,14 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) #else int nlanes = 1; #endif - for( ; j <= size.width - nlanes - cn*2; j += nlanes ) + for( ; j < size.width - cn*2; j += nlanes) { + if ( j > size.width - cn*2 - nlanes ) { + if (j == cn*2 || src == dst) { + break; + } + j = size.width - cn*2 - nlanes; + } VT p0 = vop.load(row[0]+j-cn*2), p5 = vop.load(row[1]+j-cn*2), p10 = vop.load(row[2]+j-cn*2), p15 = vop.load(row[3]+j-cn*2), p20 = vop.load(row[4]+j-cn*2); VT p1 = vop.load(row[0]+j-cn*1), p6 = vop.load(row[1]+j-cn*1), p11 = vop.load(row[2]+j-cn*1), p16 = vop.load(row[3]+j-cn*1), p21 = vop.load(row[4]+j-cn*1); VT p2 = vop.load(row[0]+j-cn*0), p7 = vop.load(row[1]+j-cn*0), p12 = vop.load(row[2]+j-cn*0), p17 = vop.load(row[3]+j-cn*0), p22 = vop.load(row[4]+j-cn*0); @@ -830,6 +846,7 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19); vop(p7, p11); vop(p11, p13); vop(p11, p12); vop.store(dst+j, p12); + } limit = size.width; diff --git a/modules/imgproc/src/pyramids.cpp b/modules/imgproc/src/pyramids.cpp index de20098542..531b36b774 100644 --- a/modules/imgproc/src/pyramids.cpp +++ b/modules/imgproc/src/pyramids.cpp @@ -769,7 +769,6 @@ template <> int PyrUpVecVOneRow(int** src, uchar* dst, int width) r20 = *(row2 + x); int _2r10 = r10 + r10; int d = r00 + r20 + (_2r10 + _2r10 + _2r10); - int d_shifted = (r10 + r20) << 2; // Similar to v_rshr_pack_u<6>(d, vx_setzero_s16()).get0() *(dst + x) = (int)((((unsigned int)d) + ((1 << (6 - 1)))) >> 6); } diff --git a/modules/imgproc/test/test_bilateral_filter.cpp b/modules/imgproc/test/test_bilateral_filter.cpp index 0f92836808..8f800a1215 100644 --- a/modules/imgproc/test/test_bilateral_filter.cpp +++ b/modules/imgproc/test/test_bilateral_filter.cpp @@ -110,10 +110,12 @@ namespace opencv_test { namespace { src.type() == dst.type() && src.size() == dst.size() && src.data != dst.data ); - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; + constexpr double eps = 1e-6; + if( sigma_color <= eps || sigma_space <= eps ) + { + src.copyTo(dst); + return; + } double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_space_coeff = -0.5/(sigma_space*sigma_space); diff --git a/modules/imgproc/test/test_drawing.cpp b/modules/imgproc/test/test_drawing.cpp index 8189a5e14d..12e10a1396 100755 --- a/modules/imgproc/test/test_drawing.cpp +++ b/modules/imgproc/test/test_drawing.cpp @@ -966,7 +966,7 @@ TEST(Drawing, fillpoly_fully) cv::Mat labelImage(binary.size(), CV_32S); cv::Mat labelCentroids; int labels = 
cv::connectedComponents(binary, labelImage, 4); - EXPECT_EQ(2, labels) << "artifacts occured"; + EXPECT_EQ(2, labels) << "artifacts occurred"; } // check if filling went over border @@ -1055,7 +1055,7 @@ PARAM_TEST_CASE(FillPolyFully, unsigned, unsigned, int, int, Point, cv::LineType cv::Mat labelImage(binary.size(), CV_32S); cv::Mat labelCentroids; int labels = cv::connectedComponents(binary, labelImage, 4); - EXPECT_EQ(2, labels) << "artifacts occured"; + EXPECT_EQ(2, labels) << "artifacts occurred"; } void check_filling_over_border(cv::Mat& img, const std::vector& polygonPoints) diff --git a/modules/java/generator/CMakeLists.txt b/modules/java/generator/CMakeLists.txt index b8ae34023b..130e6d5fec 100644 --- a/modules/java/generator/CMakeLists.txt +++ b/modules/java/generator/CMakeLists.txt @@ -56,6 +56,12 @@ foreach(m ${OPENCV_JAVA_MODULES}) ocv_remap_files(misc_files) endforeach(m) +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVBindingsPreprocessorDefinitions.cmake") +ocv_bindings_generator_populate_preprocessor_definitions( + OPENCV_MODULES_BUILD + opencv_preprocessor_defs +) + set(CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/gen_java.json") set(__config_str "{ @@ -63,6 +69,9 @@ set(__config_str \"modules\": [ ${__modules_config} ], + \"preprocessor_definitions\": { +${opencv_preprocessor_defs} + }, \"files_remap\": [ ${__remap_config} ] diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index b69841c35a..f6e85ea99a 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -65,6 +65,7 @@ type_dict = { "char" : { "j_type" : "char", "jn_type" : "char", "jni_type" : "jchar", "suffix" : "C" }, "int" : { "j_type" : "int", "jn_type" : "int", "jni_type" : "jint", "suffix" : "I" }, "long" : { "j_type" : "int", "jn_type" : "int", "jni_type" : "jint", "suffix" : "I" }, + "long long" : { "j_type" : "long", "jn_type" : "long", "jni_type" : "jlong", "suffix" : "J" }, "float" : { "j_type" : "float", "jn_type" : "float", "jni_type" : "jfloat", "suffix" : "F" }, "double" : { "j_type" : "double", "jn_type" : "double", "jni_type" : "jdouble", "suffix" : "D" }, "size_t" : { "j_type" : "long", "jn_type" : "long", "jni_type" : "jlong", "suffix" : "J" }, @@ -89,6 +90,13 @@ type_dict = { 'v_type': 'string', 'j_import': 'java.lang.String' }, + "byte[]": { + "j_type" : "byte[]", + "jn_type": "byte[]", + "jni_type": "jbyteArray", + "jni_name": "n_%(n)s", + "jni_var": "char* n_%(n)s = reinterpret_cast(env->GetByteArrayElements(%(n)s, NULL))", + }, } # Defines a rule to add extra prefixes for names from specific namespaces. 
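The new "byte[]" entry in type_dict assumes the usual JNI pinning pattern around GetByteArrayElements. A hedged, self-contained sketch of that pattern is below; the Java package, class and function names are hypothetical, and the release call shown here is the wrapper author's responsibility rather than something the table entry itself emits.

#include <jni.h>

extern "C" JNIEXPORT jint JNICALL
Java_org_example_Demo_countBytes(JNIEnv* env, jclass, jbyteArray data)
{
    // Pin the Java byte[] and view it as a char buffer, matching the cast
    // used in the "byte[]" jni_var template.
    jbyte* elems = env->GetByteArrayElements(data, nullptr);
    if (elems == nullptr)
        return 0; // out of memory or invalid array
    const jsize len = env->GetArrayLength(data);
    const char* buf = reinterpret_cast<const char*>(elems);

    int nonZero = 0;
    for (jsize i = 0; i < len; ++i)
        if (buf[i] != 0)
            ++nonZero;

    // Unpin; JNI_ABORT discards any local modifications to the buffer.
    env->ReleaseByteArrayElements(data, elems, JNI_ABORT);
    return nonZero;
}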
@@ -523,14 +531,14 @@ class JavaWrapperGenerator(object): if classinfo.base: classinfo.addImports(classinfo.base) - type_dict.setdefault("Ptr_"+name, {}).update( - { "j_type" : classinfo.jname, - "jn_type" : "long", "jn_args" : (("__int64", ".getNativeObjAddr()"),), - "jni_name" : "*((Ptr<"+classinfo.fullNameCPP()+">*)%(n)s_nativeObj)", "jni_type" : "jlong", - "suffix" : "J", - "j_import" : "org.opencv.%s.%s" % (self.module, classinfo.jname) + if ("Ptr_"+name) not in type_dict: + type_dict["Ptr_"+name] = { + "j_type" : classinfo.jname, + "jn_type" : "long", "jn_args" : (("__int64", ".getNativeObjAddr()"),), + "jni_name" : "*((Ptr<"+classinfo.fullNameCPP()+">*)%(n)s_nativeObj)", "jni_type" : "jlong", + "suffix" : "J", + "j_import" : "org.opencv.%s.%s" % (self.module, classinfo.jname) } - ) logging.info('ok: class %s, name: %s, base: %s', classinfo, name, classinfo.base) def add_const(self, decl, enumType=None): # [ "const cname", val, [], [] ] @@ -595,12 +603,16 @@ class JavaWrapperGenerator(object): f.write(buf) updated_files += 1 - def gen(self, srcfiles, module, output_path, output_jni_path, output_java_path, common_headers): + def gen(self, srcfiles, module, output_path, output_jni_path, output_java_path, common_headers, + preprocessor_definitions=None): self.clear() self.module = module self.Module = module.capitalize() # TODO: support UMat versions of declarations (implement UMat-wrapper for Java) - parser = hdr_parser.CppHeaderParser(generate_umat_decls=False) + parser = hdr_parser.CppHeaderParser( + generate_umat_decls=False, + preprocessor_definitions=preprocessor_definitions + ) self.add_class( ['class cv.' + self.Module, '', [], []] ) # [ 'class/struct cname', ':bases', [modlist] [props] ] @@ -1450,6 +1462,7 @@ if __name__ == "__main__": gen_dict_files = [] print("JAVA: Processing OpenCV modules: %d" % len(config['modules'])) + preprocessor_definitions = config.get('preprocessor_definitions', None) for e in config['modules']: (module, module_location) = (e['name'], os.path.join(ROOT_DIR, e['location'])) logging.info("\n=== MODULE: %s (%s) ===\n" % (module, module_location)) @@ -1514,7 +1527,8 @@ if __name__ == "__main__": copy_java_files(java_test_files_dir, java_test_base_path, 'org/opencv/test/' + module) if len(srcfiles) > 0: - generator.gen(srcfiles, module, dstdir, jni_path, java_path, common_headers) + generator.gen(srcfiles, module, dstdir, jni_path, java_path, common_headers, + preprocessor_definitions) else: logging.info("No generated code for module: %s", module) generator.finalize(jni_path) diff --git a/modules/js/generator/CMakeLists.txt b/modules/js/generator/CMakeLists.txt index c66608e917..48e3e2f92e 100644 --- a/modules/js/generator/CMakeLists.txt +++ b/modules/js/generator/CMakeLists.txt @@ -18,6 +18,7 @@ endforeach(m) # header blacklist ocv_list_filterout(opencv_hdrs "modules/.*.h$") +ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/fast_math.hpp") ocv_list_filterout(opencv_hdrs "modules/core/.*/cuda") ocv_list_filterout(opencv_hdrs "modules/core/.*/opencl") ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/opengl.hpp") @@ -30,7 +31,14 @@ ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/utils/*.privat ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/utils/instrumentation.hpp") ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/utils/trace*") -ocv_update_file("${CMAKE_CURRENT_BINARY_DIR}/headers.txt" "${opencv_hdrs}") +set(config_json_headers_list "") +foreach(header IN LISTS 
opencv_hdrs) + if(NOT config_json_headers_list STREQUAL "") + set(config_json_headers_list "${config_json_headers_list},\n\"${header}\"") + else() + set(config_json_headers_list "\"${header}\"") + endif() +endforeach() set(bindings_cpp "${OPENCV_JS_BINDINGS_DIR}/gen/bindings.cpp") @@ -55,16 +63,42 @@ else() message(STATUS "Use autogenerated whitelist ${OPENCV_JS_WHITELIST_FILE}") endif() +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVBindingsPreprocessorDefinitions.cmake") +ocv_bindings_generator_populate_preprocessor_definitions( + OPENCV_MODULES_BUILD + opencv_preprocessor_defs +) + +set(__config_str +"{ + \"headers\": [ +${config_json_headers_list} + ], + \"preprocessor_definitions\": { +${opencv_preprocessor_defs} + }, + \"core_bindings_file_path\": \"${JS_SOURCE_DIR}/src/core_bindings.cpp\" +}") +set(JSON_CONFIG_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/gen_js_config.json") +if(EXISTS "${JSON_CONFIG_FILE_PATH}") + file(READ "${JSON_CONFIG_FILE_PATH}" __content) +else() + set(__content "") +endif() +if(NOT "${__content}" STREQUAL "${__config_str}") + file(WRITE "${JSON_CONFIG_FILE_PATH}" "${__config_str}") +endif() +unset(__config_str) + add_custom_command( OUTPUT ${bindings_cpp} "${OPENCV_DEPHELPER}/gen_opencv_js_source" COMMAND ${PYTHON_DEFAULT_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/embindgen.py" - "${scripts_hdr_parser}" - "${bindings_cpp}" - "${CMAKE_CURRENT_BINARY_DIR}/headers.txt" - "${JS_SOURCE_DIR}/src/core_bindings.cpp" - "${OPENCV_JS_WHITELIST_FILE}" + --parser "${scripts_hdr_parser}" + --output_file "${bindings_cpp}" + --config "${JSON_CONFIG_FILE_PATH}" + --whitelist "${OPENCV_JS_WHITELIST_FILE}" COMMAND ${CMAKE_COMMAND} -E touch "${OPENCV_DEPHELPER}/gen_opencv_js_source" WORKING_DIRECTORY @@ -73,6 +107,7 @@ add_custom_command( ${JS_SOURCE_DIR}/src/core_bindings.cpp ${CMAKE_CURRENT_SOURCE_DIR}/embindgen.py ${CMAKE_CURRENT_SOURCE_DIR}/templates.py + ${JSON_CONFIG_FILE_PATH} "${OPENCV_JS_WHITELIST_FILE}" ${scripts_hdr_parser} #(not needed - generated by CMake) ${CMAKE_CURRENT_BINARY_DIR}/headers.txt diff --git a/modules/js/generator/embindgen.py b/modules/js/generator/embindgen.py index 8352893133..d5d600e83f 100644 --- a/modules/js/generator/embindgen.py +++ b/modules/js/generator/embindgen.py @@ -319,7 +319,7 @@ class Namespace(object): class JSWrapperGenerator(object): - def __init__(self): + def __init__(self, preprocessor_definitions=None): self.bindings = [] self.wrapper_funcs = [] @@ -328,7 +328,9 @@ class JSWrapperGenerator(object): self.namespaces = {} self.enums = {} # FIXIT 'enums' should belong to 'namespaces' - self.parser = hdr_parser.CppHeaderParser() + self.parser = hdr_parser.CppHeaderParser( + preprocessor_definitions=preprocessor_definitions + ) self.class_idx = 0 def add_class(self, stype, name, decl): @@ -962,41 +964,69 @@ class JSWrapperGenerator(object): if __name__ == "__main__": - if len(sys.argv) < 5: - print("Usage:\n", \ - os.path.basename(sys.argv[0]), \ - " ") - print("Current args are: ", ", ".join(["'"+a+"'" for a in sys.argv])) - exit(1) + import argparse - dstdir = "." 
- hdr_parser_path = os.path.abspath(sys.argv[1]) + arg_parser = argparse.ArgumentParser( + description="OpenCV JavaScript bindings generator" + ) + arg_parser.add_argument( + "-p", "--parser", + required=True, + help="Full path to OpenCV header parser `hdr_parser.py`" + ) + arg_parser.add_argument( + "-o", "--output_file", + dest="output_file_path", + required=True, + help="Path to output file containing js bindings" + ) + arg_parser.add_argument( + "-c", "--config", + dest="config_json_path", + required=True, + help="Path to generator configuration file in .json format" + ) + arg_parser.add_argument( + "--whitelist", + dest="whitelist_file_path", + required=True, + help="Path to whitelist.js or opencv_js.config.py" + ) + args = arg_parser.parse_args() + + # import header parser + hdr_parser_path = os.path.abspath(args.parser) if hdr_parser_path.endswith(".py"): hdr_parser_path = os.path.dirname(hdr_parser_path) sys.path.append(hdr_parser_path) import hdr_parser - bindingsCpp = sys.argv[2] - headers = open(sys.argv[3], 'r').read().split(';') - coreBindings = sys.argv[4] - whiteListFile = sys.argv[5] + with open(args.config_json_path, "r") as fh: + config_json = json.load(fh) + headers = config_json.get("headers", ()) - if whiteListFile.endswith(".json") or whiteListFile.endswith(".JSON"): - with open(whiteListFile) as f: + bindings_cpp = args.output_file_path + core_bindings_path = config_json["core_bindings_file_path"] + whitelist_file_path = args.whitelist_file_path + + if whitelist_file_path.endswith(".json") or whitelist_file_path.endswith(".JSON"): + with open(whitelist_file_path) as f: gen_dict = json.load(f) - f.close() white_list = makeWhiteListJson(gen_dict) namespace_prefix_override = makeNamespacePrefixOverride(gen_dict) - elif whiteListFile.endswith(".py") or whiteListFile.endswith(".PY"): - exec(open(whiteListFile).read()) - assert(white_list) + elif whitelist_file_path.endswith(".py") or whitelist_file_path.endswith(".PY"): + with open(whitelist_file_path) as fh: + exec(fh.read()) + assert white_list namespace_prefix_override = { 'dnn' : '', 'aruco' : '', } else: - print("Unexpected format of OpenCV config file", whiteListFile) + print("Unexpected format of OpenCV config file", whitelist_file_path) exit(1) - generator = JSWrapperGenerator() - generator.gen(bindingsCpp, headers, coreBindings) + generator = JSWrapperGenerator( + preprocessor_definitions=config_json.get("preprocessor_definitions", None) + ) + generator.gen(bindings_cpp, headers, core_bindings_path) diff --git a/modules/js/src/core_bindings.cpp b/modules/js/src/core_bindings.cpp index 00e49cd0a7..9f4d4d7e51 100644 --- a/modules/js/src/core_bindings.cpp +++ b/modules/js/src/core_bindings.cpp @@ -466,7 +466,8 @@ EMSCRIPTEN_BINDINGS(binding_utils) register_vector("DoubleVector"); register_vector("StringVector"); register_vector("PointVector"); - register_vector("Point3fVector"); + register_vector("Point2fVector"); + register_vector>("Point3fVector"); register_vector("MatVector"); register_vector("RectVector"); register_vector("KeyPointVector"); @@ -612,6 +613,7 @@ EMSCRIPTEN_BINDINGS(binding_utils) EMSCRIPTEN_CV_RECT(int, "Rect") EMSCRIPTEN_CV_RECT(float, "Rect2f") + EMSCRIPTEN_CV_RECT(double, "Rect2d") emscripten::value_object("RotatedRect") .field("center", &cv::RotatedRect::center) diff --git a/modules/js/test/test_objdetect.js b/modules/js/test/test_objdetect.js index 9e064be4f7..fd578cf7e4 100644 --- a/modules/js/test/test_objdetect.js +++ b/modules/js/test/test_objdetect.js @@ -197,6 +197,8 @@ 
QUnit.test('Charuco detector', function (assert) { board.generateImage(new cv.Size(300, 500), board_image); assert.ok(!board_image.empty()); + let chess_corners = board.getChessboardCorners(); + detector.detectBoard(board_image, corners, ids); assert.ok(!corners.empty()); assert.ok(!ids.empty()); @@ -211,5 +213,6 @@ QUnit.test('Charuco detector', function (assert) { detector.delete(); corners.delete(); ids.delete(); + chess_corners.delete(); } }); diff --git a/modules/objc/generator/CMakeLists.txt b/modules/objc/generator/CMakeLists.txt index d33e998142..2602f913f0 100644 --- a/modules/objc/generator/CMakeLists.txt +++ b/modules/objc/generator/CMakeLists.txt @@ -38,6 +38,13 @@ if(HAVE_opencv_objc) set(__objc_build_dir "\"objc_build_dir\": \"${CMAKE_CURRENT_BINARY_DIR}/../objc\",") endif() +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVBindingsPreprocessorDefinitions.cmake") + +ocv_bindings_generator_populate_preprocessor_definitions( + OPENCV_MODULES_BUILD + opencv_preprocessor_defs +) + set(CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/gen_objc.json") set(__config_str "{ @@ -45,7 +52,10 @@ set(__config_str ${__objc_build_dir} \"modules\": [ ${__modules_config} - ] + ], + \"preprocessor_definitions\": { +${opencv_preprocessor_defs} + } } ") #TODO: ocv_update_file("${CONFIG_FILE}" "${__config_str}" ON_CHANGE_REMOVE "${OPENCV_DEPHELPER}/gen_opencv_objc_source") diff --git a/modules/objc/generator/gen_objc.py b/modules/objc/generator/gen_objc.py index 484ca3b9c7..7ab7b74e25 100755 --- a/modules/objc/generator/gen_objc.py +++ b/modules/objc/generator/gen_objc.py @@ -895,7 +895,8 @@ class ObjectiveCWrapperGenerator(object): namespace = self.classes[cname].namespace if cname in self.classes else "cv" return namespace.replace(".", "::") + "::" - def gen(self, srcfiles, module, output_path, output_objc_path, common_headers, manual_classes): + def gen(self, srcfiles, module, output_path, output_objc_path, + common_headers, manual_classes, preprocessor_definitions=None): self.clear() self.module = module self.objcmodule = make_objcmodule(module) @@ -904,7 +905,10 @@ class ObjectiveCWrapperGenerator(object): extension_signatures = [] # TODO: support UMat versions of declarations (implement UMat-wrapper for Java) - parser = hdr_parser.CppHeaderParser(generate_umat_decls=False) + parser = hdr_parser.CppHeaderParser( + generate_umat_decls=False, + preprocessor_definitions=preprocessor_definitions + ) module_ci = self.add_class( ['class ' + self.Module, '', [], []]) # [ 'class/struct cname', ':bases', [modlist] [props] ] module_ci.header_import = module + '.hpp' @@ -1716,7 +1720,9 @@ if __name__ == "__main__": manual_classes = [x for x in [x[x.rfind('/')+1:-2] for x in [x for x in copied_files if x.endswith('.h')]] if x in type_dict] if len(srcfiles) > 0: - generator.gen(srcfiles, module, dstdir, objc_base_path, common_headers, manual_classes) + generator.gen(srcfiles, module, dstdir, objc_base_path, + common_headers, manual_classes, + config.get("preprocessor_definitions")) else: logging.info("No generated code for module: %s", module) generator.finalize(args.target, objc_base_path, objc_build_dir) diff --git a/modules/photo/src/hdr_common.cpp b/modules/photo/src/hdr_common.cpp index 983efe3792..5e4bea9d85 100644 --- a/modules/photo/src/hdr_common.cpp +++ b/modules/photo/src/hdr_common.cpp @@ -63,10 +63,12 @@ Mat triangleWeights() { // hat function Mat w(LDR_SIZE, 1, CV_32F); - int half = LDR_SIZE / 2; - for(int i = 0; i < LDR_SIZE; i++) { - w.at(i) = i < half ? 
i + 1.0f : LDR_SIZE - i; - } + int half = LDR_SIZE / 2; + int maxVal = LDR_SIZE - 1; + for (int i = 0; i < LDR_SIZE; i++) + w.at<float>(i) = (i < half) + ? static_cast<float>(i) + : static_cast<float>(maxVal - i); return w; } diff --git a/modules/photo/test/test_hdr.cpp b/modules/photo/test/test_hdr.cpp index a26e83e49e..264a7d7257 100644 --- a/modules/photo/test/test_hdr.cpp +++ b/modules/photo/test/test_hdr.cpp @@ -187,11 +187,9 @@ TEST(Photo_MergeDebevec, regression) Mat result, expected; loadImage(test_path + "merge/debevec.hdr", expected); merge->process(images, result, times, response); - Ptr<Tonemap> map = createTonemap(); map->process(result, result); map->process(expected, expected); - checkEqual(expected, result, 1e-2f, "Debevec"); } @@ -221,16 +219,15 @@ TEST(Photo_CalibrateDebevec, regression) loadExposureSeq(test_path + "exposures/", images, times); loadResponseCSV(test_path + "calibrate/debevec.csv", expected); Ptr<CalibrateDebevec> calibrate = createCalibrateDebevec(); - calibrate->process(images, response, times); Mat diff = abs(response - expected); diff = diff.mul(1.0f / response); double max; minMaxLoc(diff, NULL, &max); #if defined(__arm__) || defined(__aarch64__) - ASSERT_LT(max, 0.2); + ASSERT_LT(max, 0.25); #else - ASSERT_LT(max, 0.1); + ASSERT_LT(max, 0.15); #endif } @@ -266,4 +263,46 @@ TEST(Photo_CalibrateRobertson, bug_18180) EXPECT_EQ(0.0, cv::norm(response, response_no_nans, NORM_L2)); } +TEST(Photo_CalibrateDebevec, bug_24966) +{ + string test_path = string(cvtest::TS::ptr()->get_data_path()) + "hdr/"; + vector<Mat> all_images; + vector<float> all_times; + loadExposureSeq(test_path + "exposures/", all_images, all_times); + // Use a balanced subset of exposures + vector<int> selected_indices = {1,2,3,4,5}; + vector<Mat> images; + vector<float> times; + for (int idx : selected_indices) { + images.push_back(all_images[idx]); + times.push_back(all_times[idx]); + } + // Run CRF estimation for different sample points + vector<int> sample_points = {200,300,400}; + vector<Mat> responses; + for (int samples : sample_points) { + Ptr<CalibrateDebevec> calibrate = createCalibrateDebevec(samples); + Mat response; + calibrate->process(images, response, times); + Mat roi = response.rowRange(15, 240); //Checking CRF only in the middle of the range + responses.push_back(roi); + } + + // Compare consecutive pairs of CRFs + for (size_t i = 0; i < responses.size()-1; ++i) { + Mat diff = abs(responses[i] - responses[i+1]); + double max_diff; + minMaxLoc(diff, nullptr, &max_diff); + cout << "max_diff = " << max_diff << endl; + #if defined(__aarch64__) && defined(__APPLE__) + ASSERT_LT(max_diff, 10) << "CRF instability detected between samples=" + << sample_points[i] << " and " << sample_points[i+1] + << " (max diff = " << max_diff << ")"; + #else + ASSERT_LT(max_diff, 5) << "CRF instability detected between samples=" + << sample_points[i] << " and " << sample_points[i+1] + << " (max diff = " << max_diff << ")"; + #endif + } +} }} // namespace diff --git a/modules/python/bindings/CMakeLists.txt b/modules/python/bindings/CMakeLists.txt index 918411864c..c511b9bc80 100644 --- a/modules/python/bindings/CMakeLists.txt +++ b/modules/python/bindings/CMakeLists.txt @@ -47,6 +47,7 @@ endforeach(m) # header blacklist ocv_list_filterout(opencv_hdrs "modules/.*\\\\.h$") +ocv_list_filterout(opencv_hdrs "modules/core/include/opencv2/core/fast_math.hpp") ocv_list_filterout(opencv_hdrs "modules/core/.*/cuda/") ocv_list_filterout(opencv_hdrs "modules/core/.*/hal/") ocv_list_filterout(opencv_hdrs "modules/core/.*/opencl/") @@ -74,12 +75,50 @@ set(cv2_generated_files "${OPENCV_PYTHON_SIGNATURES_FILE}" )
-string(REPLACE ";" "\n" opencv_hdrs_ "${opencv_hdrs}") -file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/headers.txt" "${opencv_hdrs_}") + +set(config_json_headers_list "") +foreach(header IN LISTS opencv_hdrs) + if(NOT config_json_headers_list STREQUAL "") + set(config_json_headers_list "${config_json_headers_list},\n\"${header}\"") + else() + set(config_json_headers_list "\"${header}\"") + endif() +endforeach() + +include("${OpenCV_SOURCE_DIR}/cmake/OpenCVBindingsPreprocessorDefinitions.cmake") + +ocv_bindings_generator_populate_preprocessor_definitions( + OPENCV_MODULES_BUILD + opencv_preprocessor_defs +) + +set(__config_str +"{ + \"headers\": [ +${config_json_headers_list} + ], + \"preprocessor_definitions\": { +${opencv_preprocessor_defs} + } +}") + +set(JSON_CONFIG_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/gen_python_config.json") +if(EXISTS "${JSON_CONFIG_FILE_PATH}") + file(READ "${JSON_CONFIG_FILE_PATH}" __content) +else() + set(__content "") +endif() +if(NOT "${__content}" STREQUAL "${__config_str}") + file(WRITE "${JSON_CONFIG_FILE_PATH}" "${__config_str}") +endif() +unset(__config_str) + file(GLOB_RECURSE typing_stubs_generation_files "${PYTHON_SOURCE_DIR}/src2/typing_stubs_generation/*.py") add_custom_command( OUTPUT ${cv2_generated_files} - COMMAND "${PYTHON_DEFAULT_EXECUTABLE}" "${PYTHON_SOURCE_DIR}/src2/gen2.py" "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/headers.txt" + COMMAND "${PYTHON_DEFAULT_EXECUTABLE}" "${PYTHON_SOURCE_DIR}/src2/gen2.py" + "--config" "${JSON_CONFIG_FILE_PATH}" + "--output_dir" "${CMAKE_CURRENT_BINARY_DIR}" DEPENDS "${PYTHON_SOURCE_DIR}/src2/gen2.py" "${PYTHON_SOURCE_DIR}/src2/hdr_parser.py" "${typing_stubs_generation_files}" diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 528e2aba0e..c23cac483d 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -43,7 +43,9 @@ typedef std::vector vector_DMatch; typedef std::vector vector_String; typedef std::vector vector_string; typedef std::vector vector_Scalar; +#ifdef HAVE_OPENCV_OBJDETECT typedef std::vector vector_Dictionary; +#endif // HAVE_OPENCV_OBJDETECT typedef std::vector > vector_vector_char; typedef std::vector > vector_vector_Point; diff --git a/modules/python/src2/cv2_convert.hpp b/modules/python/src2/cv2_convert.hpp index 979425c3f9..5ef81855ea 100644 --- a/modules/python/src2/cv2_convert.hpp +++ b/modules/python/src2/cv2_convert.hpp @@ -182,6 +182,29 @@ struct PyOpenCV_Converter } }; +// There is conflict between "long long" and "int64". +// They are the same type on some 32-bit platforms. 
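+// Where int64 is itself long long, the enable_if condition below disables this specialization so it does not clash with the existing int64 converter.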
+template<typename T> +struct PyOpenCV_Converter + < T, typename std::enable_if< std::is_same<long long, T>::value && !std::is_same<int64, long long>::value >::type > +{ + static inline PyObject* from(const long long& value) + { + return PyLong_FromLongLong(value); + } + + static inline bool to(PyObject* obj, long long& value, const ArgInfo& info) + { + CV_UNUSED(info); + if(!obj || obj == Py_None) + return true; + else if(PyLong_Check(obj)) + value = PyLong_AsLongLong(obj); + else + return false; + return value != (long long)-1 || !PyErr_Occurred(); + } +}; // --- uchar template<> bool pyopencv_to(PyObject* obj, uchar& value, const ArgInfo& info); diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 0dec0ad1a4..e306a3c7bb 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -2,6 +2,7 @@ from __future__ import print_function import hdr_parser, sys, re +import json from string import Template from pprint import pprint from collections import namedtuple @@ -1109,6 +1110,18 @@ class Namespace(object): class PythonWrapperGenerator(object): + class Config: + def __init__(self, headers, preprocessor_definitions = None): + self.headers = headers + if preprocessor_definitions is None: + preprocessor_definitions = {} + elif not isinstance(preprocessor_definitions, dict): + raise TypeError( + "preprocessor_definitions should be either a dictionary or None. " + "Got: {}".format(type(preprocessor_definitions).__name__) + ) + self.preprocessor_definitions = preprocessor_definitions + def __init__(self): self.clear() @@ -1327,13 +1340,16 @@ class PythonWrapperGenerator(object): f.write(buf.getvalue()) def save_json(self, path, name, value): - import json with open(path + "/" + name, "wt") as f: json.dump(value, f) - def gen(self, srcfiles, output_path): + def gen(self, srcfiles, output_path, preprocessor_definitions = None): self.clear() - self.parser = hdr_parser.CppHeaderParser(generate_umat_decls=True, generate_gpumat_decls=True) + self.parser = hdr_parser.CppHeaderParser( + generate_umat_decls=True, + generate_gpumat_decls=True, + preprocessor_definitions=preprocessor_definitions + ) # step 1: scan the headers and build more descriptive maps of classes, consts, functions for hdr in srcfiles: @@ -1504,12 +1520,36 @@ class PythonWrapperGenerator(object): self.save_json(output_path, "pyopencv_signatures.json", self.py_signatures) if __name__ == "__main__": - srcfiles = hdr_parser.opencv_hdr_list - dstdir = "/Users/vp/tmp" - if len(sys.argv) > 1: - dstdir = sys.argv[1] - if len(sys.argv) > 2: - with open(sys.argv[2], 'r') as f: - srcfiles = [l.strip() for l in f.readlines()] + import argparse + import tempfile + + arg_parser = argparse.ArgumentParser( + description="OpenCV Python bindings generator" + ) + arg_parser.add_argument( + "-c", "--config", + dest="config_json_path", + required=False, + help="Generator configuration file in .json format. " + "Refer to PythonWrapperGenerator.Config for available " + "configuration keys" + ) + arg_parser.add_argument( + "-o", "--output_dir", + dest="output_dir", + default=tempfile.gettempdir(), + help="Generated bindings output directory" + ) + args = arg_parser.parse_args() + if args.config_json_path is not None: + with open(args.config_json_path, "r") as fh: + config_json = json.load(fh) + config = PythonWrapperGenerator.Config(**config_json) + else: + config = PythonWrapperGenerator.Config( + headers=hdr_parser.opencv_hdr_list + ) + generator = PythonWrapperGenerator() - generator.gen(srcfiles, dstdir) + + generator.gen(config.headers, args.output_dir, 
config.preprocessor_definitions) diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py index 196703518d..8c1db67303 100755 --- a/modules/python/src2/hdr_parser.py +++ b/modules/python/src2/hdr_parser.py @@ -33,11 +33,160 @@ where the list of modifiers is yet another nested list of strings original_return_type is None if the original_return_type is the same as return_value_type """ +def evaluate_conditional_inclusion_directive(directive, preprocessor_definitions): + """Evaluates C++ conditional inclusion directive. + Reference: https://en.cppreference.com/w/cpp/preprocessor/conditional + + Args: + directive(str): input C++ conditional directive. + preprocessor_definitions(dict[str, int]): defined preprocessor identifiers. + + Returns: + bool: True, if directive is evaluated to 1, False otherwise. + + >>> evaluate_conditional_inclusion_directive("#ifdef A", {"A": 0}) + True + + >>> evaluate_conditional_inclusion_directive("#ifdef A", {"B": 0}) + False + + >>> evaluate_conditional_inclusion_directive("#ifndef A", {}) + True + + >>> evaluate_conditional_inclusion_directive("#ifndef A", {"A": 1}) + False + + >>> evaluate_conditional_inclusion_directive("#if 0", {}) + False + + >>> evaluate_conditional_inclusion_directive("#if 1", {}) + True + + >>> evaluate_conditional_inclusion_directive("#if VAR", {"VAR": 0}) + False + + >>> evaluate_conditional_inclusion_directive("#if VAR ", {"VAR": 1}) + True + + >>> evaluate_conditional_inclusion_directive("#if defined(VAR)", {"VAR": 0}) + True + + >>> evaluate_conditional_inclusion_directive("#if !defined(VAR)", {"VAR": 0}) + False + + >>> evaluate_conditional_inclusion_directive("#if defined(VAR_1)", {"VAR_2": 0}) + False + + >>> evaluate_conditional_inclusion_directive( + ... "#if defined(VAR) && VAR", {"VAR": 0} + ... ) + False + + >>> evaluate_conditional_inclusion_directive( + ... "#if VAR_1 || VAR_2", {"VAR_1": 1, "VAR_2": 0} + ... ) + True + + >>> evaluate_conditional_inclusion_directive( + ... "#if defined VAR && defined (VAR)", {"VAR": 1} + ... ) + True + + >>> evaluate_conditional_inclusion_directive( + ... "#if strangedefinedvar", {} + ... ) + Traceback (most recent call last): + ... 
+ ValueError: Failed to evaluate '#if strangedefinedvar' directive, stripped down to 'strangedefinedvar' + """ + OPERATORS = { "!": "not ", "&&": "and", "&": "and", "||": "or", "|": "or" } + + input_directive = directive + + # Ignore all directives if they contain __cplusplus check + if "__cplusplus" in directive: + return True + + directive = directive.strip() + if directive.startswith("#ifdef "): + var = directive[len("#ifdef "):].strip() + return var in preprocessor_definitions + if directive.startswith("#ifndef "): + var = directive[len("#ifndef "):].strip() + return var not in preprocessor_definitions + + if directive.startswith("#if "): + directive = directive[len("#if "):].strip() + elif directive.startswith("#elif "): + directive = directive[len("#elif "):].strip() + else: + raise ValueError("{} is not a known conditional directive".format(directive)) + + if directive.isdigit(): + return int(directive) != 0 + + if directive in preprocessor_definitions: + return bool(preprocessor_definitions[directive]) + + # Convert all `defined` operators to their boolean representations; + # they have 2 forms: `defined identifier` and `defined(identifier)` + directive = re.sub( + r"\bdefined\s*(\w+|\(\w+\))", + lambda m: "True" if m.group(1).strip("() ") in preprocessor_definitions else "False", + directive + ) + + for src_op, dst_op in OPERATORS.items(): + directive = directive.replace(src_op, dst_op) + + try: + if sys.version_info >= (3, 13): + eval_directive = eval(directive, + globals={"__builtins__": {}}, + locals=preprocessor_definitions) + else: + eval_directive = eval(directive, + {"__builtins__": {}}, + preprocessor_definitions) + except Exception as e: + raise ValueError( + "Failed to evaluate '{}' directive, stripped down to '{}'".format( + input_directive, directive + ) + ) from e + + if not isinstance(eval_directive, (bool, int)): + raise TypeError( + "'{}' directive is evaluated to an unexpected type: {}".format( + input_directive, type(eval_directive).__name__ + ) + ) + if isinstance(eval_directive, bool): + return eval_directive + + return eval_directive != 0 + + class CppHeaderParser(object): - def __init__(self, generate_umat_decls=False, generate_gpumat_decls=False): + def __init__(self, generate_umat_decls = False, generate_gpumat_decls = False, + preprocessor_definitions = None): self._generate_umat_decls = generate_umat_decls self._generate_gpumat_decls = generate_gpumat_decls + if preprocessor_definitions is None: + preprocessor_definitions = {} + elif not isinstance(preprocessor_definitions, dict): + raise TypeError( + "preprocessor_definitions should be either a dictionary or None. 
" + "Got: {}".format(type(preprocessor_definitions).__name__) + ) + self.preprocessor_definitions = preprocessor_definitions + if "__OPENCV_BUILD" not in self.preprocessor_definitions: + self.preprocessor_definitions["__OPENCV_BUILD"] = 0 + if "OPENCV_BINDING_PARSER" not in self.preprocessor_definitions: + self.preprocessor_definitions["OPENCV_BINDING_PARSER"] = 1 + if "OPENCV_BINDINGS_PARSER" not in self.preprocessor_definitions: + self.preprocessor_definitions["OPENCV_BINDINGS_PARSER"] = 1 self.BLOCK_TYPE = 0 self.BLOCK_NAME = 1 @@ -192,6 +341,8 @@ class CppHeaderParser(object): angle_stack[-1] += 1 elif arg_type == "struct": arg_type += " " + w + elif prev_w in ["signed", "unsigned", "short", "long"] and w in ["char", "short", "int", "long"]: + arg_type += " " + w elif arg_type and arg_type != "~": arg_name = " ".join(word_list[wi:]) break @@ -839,9 +990,8 @@ class CppHeaderParser(object): """ self.hname = hname decls = [] - f = io.open(hname, 'rt', encoding='utf-8') - linelist = list(f.readlines()) - f.close() + with io.open(hname, 'rt', encoding='utf-8') as f: + linelist = list(f.readlines()) # states: SCAN = 0 # outside of a comment or preprocessor directive @@ -859,7 +1009,6 @@ class CppHeaderParser(object): self.wrap_mode = wmode depth_if_0 = 0 - for l0 in linelist: self.lineno += 1 #print(state, self.lineno, l0) @@ -886,22 +1035,35 @@ class CppHeaderParser(object): continue state = SCAN l = re.sub(r'//(.+)?', '', l).strip() # drop // comment - if l in [ - '#if 0', - '#if defined(__OPENCV_BUILD)', '#ifdef __OPENCV_BUILD', - '#if !defined(OPENCV_BINDING_PARSER)', '#ifndef OPENCV_BINDING_PARSER', - ]: + if l.startswith("#if") or l.startswith("#elif"): + if not evaluate_conditional_inclusion_directive( + l, self.preprocessor_definitions + ): + # Condition evaluated to false + state = DIRECTIVE_IF_0 + depth_if_0 = 1 + elif l.startswith("#else"): + # else in state == DIRECTIVE may occur only if previous + # conditional inclusion directive was evaluated to True state = DIRECTIVE_IF_0 depth_if_0 = 1 continue if state == DIRECTIVE_IF_0: - if l.startswith('#'): - l = l[1:].strip() - if l.startswith("if"): + if l.startswith("#"): + if l.startswith("#if"): depth_if_0 += 1 continue - if l.startswith("endif"): + elif l.startswith("#else") and depth_if_0 == 1: + depth_if_0 = 0 + state = SCAN + elif l.startswith("#elif") and depth_if_0 == 1: + if evaluate_conditional_inclusion_directive( + l, self.preprocessor_definitions + ): + depth_if_0 = 0 + state = SCAN + elif l.startswith("#endif"): depth_if_0 -= 1 if depth_if_0 == 0: state = SCAN @@ -1075,6 +1237,9 @@ class CppHeaderParser(object): print() if __name__ == '__main__': + import doctest + doctest.testmod() + parser = CppHeaderParser(generate_umat_decls=True, generate_gpumat_decls=True) decls = [] for hname in opencv_hdr_list: diff --git a/modules/python/src2/typing_stubs_generation/predefined_types.py b/modules/python/src2/typing_stubs_generation/predefined_types.py index 7be97218cf..5d960dd07b 100644 --- a/modules/python/src2/typing_stubs_generation/predefined_types.py +++ b/modules/python/src2/typing_stubs_generation/predefined_types.py @@ -28,6 +28,7 @@ _PREDEFINED_TYPES = ( PrimitiveTypeNode.int_("uint32_t"), PrimitiveTypeNode.int_("size_t"), PrimitiveTypeNode.int_("int64_t"), + PrimitiveTypeNode.int_("long long"), PrimitiveTypeNode.float_("float"), PrimitiveTypeNode.float_("double"), PrimitiveTypeNode.bool_("bool"), diff --git a/modules/stereo/src/stereosgbm.cpp b/modules/stereo/src/stereosgbm.cpp index 8bf047ed70..cb2d3bb104 100644 --- 
a/modules/stereo/src/stereosgbm.cpp +++ b/modules/stereo/src/stereosgbm.cpp @@ -507,6 +507,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2, int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2; int npasses = params.isFullDP() ? 2 : 1; + CV_CheckGT(width - (params.minDisparity + params.numDisparities), params.calcSADWindowSize().width/2, + "Your input images are too small for your window size and max disparity, and will result in non-deterministic SGBM results"); + if( minX1 >= maxX1 ) { disp1 = Scalar::all(INVALID_DISP_SCALED); diff --git a/modules/stereo/test/test_stereomatching.cpp b/modules/stereo/test/test_stereomatching.cpp index ad230509df..826c4c8a3b 100644 --- a/modules/stereo/test/test_stereomatching.cpp +++ b/modules/stereo/test/test_stereomatching.cpp @@ -920,6 +920,38 @@ protected: TEST(Calib3d_StereoSGBM, regression) { CV_StereoSGBMTest test; test.safe_run(); } +TEST(Calib3d_StereoSGBM, deterministic) { + cv::Ptr matcher = cv::StereoSGBM::create(16, 11); + + // Expect throw error (non-determinism case) + int widthNarrow = 28; + int height = 15; + + cv::Mat leftNarrow(height, widthNarrow, CV_8UC1); + cv::Mat rightNarrow(height, widthNarrow, CV_8UC1); + randu(leftNarrow, cv::Scalar(0), cv::Scalar(255)); + randu(rightNarrow, cv::Scalar(0), cv::Scalar(255)); + cv::Mat disp; + + EXPECT_THROW(matcher->compute(leftNarrow, rightNarrow, disp), cv::Exception); + + // Deterministic case, image is sufficiently large for StereSGBM parameters + int widthWide = 40; + cv::Mat leftWide(height, widthWide, CV_8UC1); + cv::Mat rightWide(height, widthWide, CV_8UC1); + randu(leftWide, cv::Scalar(0), cv::Scalar(255)); + randu(rightWide, cv::Scalar(0), cv::Scalar(255)); + cv::Mat disp1, disp2; + for (int i = 0; i < 10; i++) { + matcher->compute(leftWide, rightWide, disp1); + matcher->compute(leftWide, rightWide, disp2); + cv::Mat dst; + cv::bitwise_xor(disp1, disp2, dst); + EXPECT_EQ(cv::countNonZero(dst), 0); + } + +} + TEST(Calib3d_StereoSGBM_HH4, regression) { String path = cvtest::TS::ptr()->get_data_path() + "cv/stereomatching/datasets/teddy/"; diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp index 702c8e6e9d..bc3b1611c6 100644 --- a/modules/videoio/include/opencv2/videoio.hpp +++ b/modules/videoio/include/opencv2/videoio.hpp @@ -719,8 +719,14 @@ class CV_EXPORTS_W IStreamReader public: virtual ~IStreamReader(); - /** @brief Read bytes from stream */ - virtual long long read(char* buffer, long long size) = 0; + /** @brief Read bytes from stream + * + * @param buffer already allocated buffer of at least @p size bytes + * @param size maximum number of bytes to read + * + * @return actual number of read bytes + */ + CV_WRAP virtual long long read(char* buffer, long long size) = 0; /** @brief Sets the stream position * @@ -729,7 +735,7 @@ public: * * @see fseek */ - virtual long long seek(long long offset, int origin) = 0; + CV_WRAP virtual long long seek(long long offset, int origin) = 0; }; class IVideoCapture; diff --git a/modules/videoio/misc/java/filelist_common b/modules/videoio/misc/java/filelist_common new file mode 100644 index 0000000000..c19b6f83ef --- /dev/null +++ b/modules/videoio/misc/java/filelist_common @@ -0,0 +1 @@ +misc/java/src/cpp/videoio_converters.hpp diff --git a/modules/videoio/misc/java/gen_dict.json b/modules/videoio/misc/java/gen_dict.json new file mode 100644 index 0000000000..0c18f4284a --- /dev/null +++ b/modules/videoio/misc/java/gen_dict.json @@ -0,0 
+1,40 @@ +{ + "ManualFuncs" : { + "IStreamReader" : { + "IStreamReader" : { + "j_code" : [ + "\n", + "/**", + " * Constructor of streaming callback object with abstract 'read' and 'seek' methods that should be implemented in Java code.
", + " * NOTE: Implemented callbacks should be called from the creation thread to avoid JNI performance degradation", + "*/", + "protected IStreamReader() { nativeObj = 0; }", + "\n" + ], + "jn_code": [], + "cpp_code": [] + } + } + }, + "func_arg_fix" : { + "read": { "buffer": {"ctype" : "byte[]"} } + }, + "type_dict": { + "Ptr_IStreamReader": { + "j_type": "IStreamReader", + "jn_type": "IStreamReader", + "jni_name": "n_%(n)s", + "jni_type": "jobject", + "jni_var": "auto n_%(n)s = makePtr(env, source)", + "j_import": "org.opencv.videoio.IStreamReader" + }, + "vector_VideoCaptureAPIs": { + "j_type": "List", + "jn_type": "List", + "jni_type": "jobject", + "jni_var": "std::vector< cv::VideoCaptureAPIs > %(n)s", + "suffix": "Ljava_util_List", + "v_type": "vector_VideoCaptureAPIs" + } + } +} diff --git a/modules/videoio/misc/java/src/cpp/videoio_converters.cpp b/modules/videoio/misc/java/src/cpp/videoio_converters.cpp new file mode 100644 index 0000000000..9f7699a2ce --- /dev/null +++ b/modules/videoio/misc/java/src/cpp/videoio_converters.cpp @@ -0,0 +1,97 @@ +#include "videoio_converters.hpp" + +class JNIEnvHandler +{ +public: + JNIEnvHandler(JavaVM* _vm) : vm(_vm) + { + jint res = vm->GetEnv((void**)&env, JNI_VERSION_1_6); + if (res == JNI_EDETACHED) + { +#ifdef __ANDROID__ + res = vm->AttachCurrentThread(&env, NULL); +#else + res = vm->AttachCurrentThread((void**)&env, NULL); +#endif // __ANDROID__ + detach = true; + } + } + + ~JNIEnvHandler() + { + if (env && detach) + { + vm->DetachCurrentThread(); + } + } + + JavaVM* vm; + JNIEnv* env = nullptr; + bool detach = false; +}; + +JavaStreamReader::JavaStreamReader(JNIEnv* env, jobject _obj) +{ + obj = env->NewGlobalRef(_obj); + jclass cls = env->GetObjectClass(obj); + m_read = env->GetMethodID(cls, "read", "([BJ)J"); + m_seek = env->GetMethodID(cls, "seek", "(JI)J"); + env->GetJavaVM(&vm); +} + +JavaStreamReader::~JavaStreamReader() +{ + JNIEnvHandler handler(vm); + JNIEnv* env = handler.env; + if (!env) + return; + env->DeleteGlobalRef(obj); +} + +long long JavaStreamReader::read(char* buffer, long long size) +{ + if (!m_read) + return 0; + JNIEnvHandler handler(vm); + JNIEnv* env = handler.env; + if (!env) + return 0; + jbyteArray jBuffer = env->NewByteArray(static_cast(size)); + if (!jBuffer) + return 0; + jlong res = env->CallLongMethod(obj, m_read, jBuffer, size); + env->GetByteArrayRegion(jBuffer, 0, static_cast(size), reinterpret_cast(buffer)); + env->DeleteLocalRef(jBuffer); + return res; +} + +long long JavaStreamReader::seek(long long offset, int way) +{ + JNIEnvHandler handler(vm); + JNIEnv* env = handler.env; + if (!env) + return 0; + if (!m_seek) + return 0; + return env->CallLongMethod(obj, m_seek, offset, way); +} + +// Same as dnn::vector_Target_to_List +jobject vector_VideoCaptureAPIs_to_List(JNIEnv* env, std::vector& vs) +{ + static jclass juArrayList = ARRAYLIST(env); + static jmethodID m_create = CONSTRUCTOR(env, juArrayList); + jmethodID m_add = LIST_ADD(env, juArrayList); + + static jclass jInteger = env->FindClass("java/lang/Integer"); + static jmethodID m_create_Integer = env->GetMethodID(jInteger, "", "(I)V"); + + jobject result = env->NewObject(juArrayList, m_create, vs.size()); + for (size_t i = 0; i < vs.size(); ++i) + { + jobject element = env->NewObject(jInteger, m_create_Integer, vs[i]); + env->CallBooleanMethod(result, m_add, element); + env->DeleteLocalRef(element); + } + return result; +} diff --git a/modules/videoio/misc/java/src/cpp/videoio_converters.hpp 
b/modules/videoio/misc/java/src/cpp/videoio_converters.hpp new file mode 100644 index 0000000000..d1ec43e2be --- /dev/null +++ b/modules/videoio/misc/java/src/cpp/videoio_converters.hpp @@ -0,0 +1,25 @@ +#ifndef VIDEOIO_CONVERTERS_HPP +#define VIDEOIO_CONVERTERS_HPP + +#include +#include "opencv_java.hpp" +#include "opencv2/core.hpp" +#include "opencv2/videoio/videoio.hpp" + +class JavaStreamReader : public cv::IStreamReader +{ +public: + JavaStreamReader(JNIEnv* env, jobject obj); + ~JavaStreamReader(); + long long read(char* buffer, long long size) CV_OVERRIDE; + long long seek(long long offset, int way) CV_OVERRIDE; + +private: + JavaVM* vm; + jobject obj; + jmethodID m_read, m_seek; +}; + +jobject vector_VideoCaptureAPIs_to_List(JNIEnv* env, std::vector& vs); + +#endif diff --git a/modules/videoio/misc/java/test/VideoCaptureTest.java b/modules/videoio/misc/java/test/VideoCaptureTest.java index 9609a55620..db862a35d8 100644 --- a/modules/videoio/misc/java/test/VideoCaptureTest.java +++ b/modules/videoio/misc/java/test/VideoCaptureTest.java @@ -1,27 +1,41 @@ package org.opencv.test.videoio; import java.util.List; +import java.io.File; +import java.io.RandomAccessFile; +import java.io.IOException; +import java.io.FileNotFoundException; +import org.opencv.core.Mat; import org.opencv.core.Size; +import org.opencv.core.MatOfInt; import org.opencv.videoio.Videoio; import org.opencv.videoio.VideoCapture; +import org.opencv.videoio.IStreamReader; import org.opencv.test.OpenCVTestCase; public class VideoCaptureTest extends OpenCVTestCase { + private final static String ENV_OPENCV_TEST_DATA_PATH = "OPENCV_TEST_DATA_PATH"; private VideoCapture capture; private boolean isOpened; private boolean isSucceed; + private File testDataPath; @Override protected void setUp() throws Exception { super.setUp(); capture = null; - isTestCaseEnabled = false; isSucceed = false; isOpened = false; + + String envTestDataPath = System.getenv(ENV_OPENCV_TEST_DATA_PATH); + + if(envTestDataPath == null) throw new Exception(ENV_OPENCV_TEST_DATA_PATH + " has to be defined!"); + + testDataPath = new File(envTestDataPath); } public void testGrab() { @@ -61,4 +75,70 @@ public class VideoCaptureTest extends OpenCVTestCase { assertNotNull(capture); } + public void testConstructorStream() throws FileNotFoundException { + // Check backend is available + Integer apiPref = Videoio.CAP_ANY; + for (Integer backend : Videoio.getStreamBufferedBackends()) + { + if (!Videoio.hasBackend(backend)) + continue; + if (!Videoio.isBackendBuiltIn(backend)) + { + int[] abi = new int[1], api = new int[1]; + Videoio.getStreamBufferedBackendPluginVersion(backend, abi, api); + if (abi[0] < 1 || (abi[0] == 1 && api[0] < 2)) + continue; + } + apiPref = backend; + break; + } + if (apiPref == Videoio.CAP_ANY) + { + throw new TestSkipException(); + } + + RandomAccessFile f = new RandomAccessFile(new File(testDataPath, "cv/video/768x576.avi"), "r"); + + IStreamReader stream = new IStreamReader() + { + @Override + public long read(byte[] buffer, long size) + { + assertEquals(buffer.length, size); + try + { + return Math.max(f.read(buffer), 0); + } + catch (IOException e) + { + System.out.println(e.getMessage()); + return 0; + } + } + + @Override + public long seek(long offset, int origin) + { + try + { + if (origin == 0) + f.seek(offset); + return f.getFilePointer(); + } + catch (IOException e) + { + System.out.println(e.getMessage()); + return 0; + } + } + }; + capture = new VideoCapture(stream, apiPref, new MatOfInt()); + assertNotNull(capture); + 
assertTrue(capture.isOpened()); + + Mat frame = new Mat(); + assertTrue(capture.read(frame)); + assertEquals(frame.rows(), 576); + assertEquals(frame.cols(), 768); + } } diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp index e4a325041c..a158119d3f 100644 --- a/modules/videoio/src/cap_gstreamer.cpp +++ b/modules/videoio/src/cap_gstreamer.cpp @@ -1162,7 +1162,7 @@ bool GStreamerCapture::retrieveFrame(int index, OutputArray dst) } } - CV_LOG_ERROR(NULL, "GStreamer(retrive): unrecognized index=" << index); + CV_LOG_ERROR(NULL, "GStreamer(retrieve): unrecognized index=" << index); return false; } diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index 5575b099e2..9450f85f5a 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -440,7 +440,7 @@ struct CvCaptureCAM_V4L CV_FINAL : public IVideoCapture bool convertableToRgb() const; void convertToRgb(const Buffer ¤tBuffer); - bool havePendingFrame; // true if next .grab() should be noop, .retrive() resets this flag + bool havePendingFrame; // true if next .grab() should be noop, .retrieve() resets this flag }; /*********************** Implementations ***************************************/ diff --git a/platforms/android/build_sdk.py b/platforms/android/build_sdk.py index 9de579e5fc..687038125e 100755 --- a/platforms/android/build_sdk.py +++ b/platforms/android/build_sdk.py @@ -138,7 +138,7 @@ class ABI: def __str__(self): return "%s (%s)" % (self.name, self.toolchain) def haveIPP(self): - return self.name == "x86" or self.name == "x86_64" + return self.name == "x86_64" def haveKleidiCV(self): return self.name == "arm64-v8a" diff --git a/platforms/android/default.config.py b/platforms/android/default.config.py index 9c7b9ad0ef..2d0b45d15e 100644 --- a/platforms/android/default.config.py +++ b/platforms/android/default.config.py @@ -2,5 +2,5 @@ ABIs = [ ABI("2", "armeabi-v7a", None, 21, cmake_vars=dict(ANDROID_ABI='armeabi-v7a with NEON')), ABI("3", "arm64-v8a", None, 21, cmake_vars=dict(ANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES='ON')), ABI("5", "x86_64", None, 21, cmake_vars=dict(ANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES='ON')), - ABI("4", "x86", None, 21), + ABI("4", "x86", None, 21, cmake_vars=dict(WITH_IPP='OFF')), ] diff --git a/platforms/android/fastcv.config.py b/platforms/android/fastcv.config.py new file mode 100644 index 0000000000..c96b534e68 --- /dev/null +++ b/platforms/android/fastcv.config.py @@ -0,0 +1,6 @@ +ABIs = [ + ABI("2", "armeabi-v7a", None, 21, cmake_vars=dict(ANDROID_ABI='armeabi-v7a with NEON', WITH_FASTCV='ON')), + ABI("3", "arm64-v8a", None, 21, cmake_vars=dict(ANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES='ON', WITH_FASTCV='ON')), + ABI("5", "x86_64", None, 21, cmake_vars=dict(ANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES='ON')), + ABI("4", "x86", None, 21, cmake_vars=dict(WITH_IPP='OFF')), +] diff --git a/samples/python/tutorial_code/imgProc/match_template/match_template.py b/samples/python/tutorial_code/imgProc/match_template/match_template.py index 25c6e3bc04..ee5ed894b3 100644 --- a/samples/python/tutorial_code/imgProc/match_template/match_template.py +++ b/samples/python/tutorial_code/imgProc/match_template/match_template.py @@ -86,8 +86,8 @@ def MatchingMethod(param): ## [match_loc] ## [imshow] - cv.rectangle(img_display, matchLoc, (matchLoc[0] + templ.shape[0], matchLoc[1] + templ.shape[1]), (0,0,0), 2, 8, 0 ) - cv.rectangle(result, matchLoc, (matchLoc[0] + templ.shape[0], matchLoc[1] + templ.shape[1]), (0,0,0), 2, 8, 0 ) + 
cv.rectangle(img_display, matchLoc, (matchLoc[0] + templ.shape[1], matchLoc[1] + templ.shape[0]), (0,0,0), 2, 8, 0 ) + cv.rectangle(result, matchLoc, (matchLoc[0] + templ.shape[1], matchLoc[1] + templ.shape[0]), (0,0,0), 2, 8, 0 ) cv.imshow(image_window, img_display) cv.imshow(result_window, result) ## [imshow]
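For reference, the corrected corner arithmetic above follows NumPy's (rows, cols) ordering: templ.shape[0] is the template height and templ.shape[1] its width, so the x offset must use shape[1] and the y offset shape[0]. A minimal standalone sketch of the same computation; the file names and the choice of TM_CCOEFF are placeholders, not part of the tutorial:

import cv2 as cv

img = cv.imread('image.png', cv.IMREAD_GRAYSCALE)        # placeholder input image
templ = cv.imread('template.png', cv.IMREAD_GRAYSCALE)   # placeholder template
result = cv.matchTemplate(img, templ, cv.TM_CCOEFF)      # for TM_CCOEFF the best match is at the maximum
_minVal, _maxVal, _minLoc, maxLoc = cv.minMaxLoc(result)
matchLoc = maxLoc                                         # top-left corner of the match, given as (x, y)
h, w = templ.shape[:2]                                    # NumPy order: (rows, cols) == (height, width)
bottomRight = (matchLoc[0] + w, matchLoc[1] + h)          # x grows with the width, y with the height
cv.rectangle(img, matchLoc, bottomRight, 0, 2)
cv.imshow('match', img)
cv.waitKey()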