Merge branch 4.x

This commit is contained in:
Alexander Smorkalov 2025-05-27 16:48:22 +03:00
commit 6c69e2cc90
167 changed files with 7024 additions and 4687 deletions

View File

@ -1,23 +1,23 @@
function(download_fastcv root_dir)
# Commit SHA in the opencv_3rdparty repo
set(FASTCV_COMMIT "8d86e68dad8b80b8575a8d3cf401d3ee96c24148")
set(FASTCV_COMMIT "abe340d0fb7f19fa9315080e3c8616642e98a296")
# Define actual FastCV versions
if(ANDROID)
if(AARCH64)
message(STATUS "Download FastCV for Android aarch64")
set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2025_04_08.tgz")
set(FCV_PACKAGE_HASH "e028966a1d1b2f3f0bc5967d316e8b64")
set(FCV_PACKAGE_NAME "fastcv_android_aarch64_2025_04_29.tgz")
set(FCV_PACKAGE_HASH "d9172a9a3e5d92d080a4192cc5691001")
else()
message(STATUS "Download FastCV for Android armv7")
set(FCV_PACKAGE_NAME "fastcv_android_arm32_2025_04_08.tgz")
set(FCV_PACKAGE_HASH "6fc1e812a4b3ef392469d2283e037ffe")
set(FCV_PACKAGE_NAME "fastcv_android_arm32_2025_04_29.tgz")
set(FCV_PACKAGE_HASH "246b5253233391cd2c74d01d49aee9c3")
endif()
elseif(UNIX AND NOT APPLE AND NOT IOS AND NOT XROS)
if(AARCH64)
set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2025_04_08.tgz")
set(FCV_PACKAGE_HASH "062a26639cd2788beee2e0dd8743d680")
set(FCV_PACKAGE_NAME "fastcv_linux_aarch64_2025_04_29.tgz")
set(FCV_PACKAGE_HASH "e2ce60e25c8e4113a7af2bd243118f4c")
else()
message("FastCV: fastcv lib for 32-bit Linux is not supported for now!")
endif()

View File

@ -146,7 +146,7 @@ TIFFHashSet *TIFFHashSetNew(TIFFHashSetHashFunc fnHashFunc,
set->fnEqualFunc = fnEqualFunc ? fnEqualFunc : TIFFHashSetEqualPointer;
set->fnFreeEltFunc = fnFreeEltFunc;
set->nSize = 0;
set->tabList = (TIFFList **)(calloc(sizeof(TIFFList *), 53));
set->tabList = (TIFFList **)(calloc(53, sizeof(TIFFList *)));
if (set->tabList == NULL)
{
free(set);
@ -367,7 +367,7 @@ static bool TIFFHashSetRehash(TIFFHashSet *set)
{
int nNewAllocatedSize = anPrimes[set->nIndiceAllocatedSize];
TIFFList **newTabList =
(TIFFList **)(calloc(sizeof(TIFFList *), nNewAllocatedSize));
(TIFFList **)(calloc(nNewAllocatedSize, sizeof(TIFFList *)));
if (newTabList == NULL)
return false;
#ifdef HASH_DEBUG

View File

@ -2873,7 +2873,7 @@ OPJ_BOOL opj_jp2_read_header(opj_stream_private_t *p_stream,
p_image,
p_manager);
if (p_image && *p_image) {
if (ret && p_image && *p_image) {
/* Set Image Color Space */
if (jp2->enumcs == 16) {
(*p_image)->color_space = OPJ_CLRSPC_SRGB;

View File

@ -920,9 +920,9 @@ if(WITH_NDSRVP)
endif()
if(WITH_HAL_RVV)
ocv_debug_message(STATUS "Enable HAL RVV acceleration")
if(NOT ";${OpenCV_HAL};" MATCHES ";halrvv;")
set(OpenCV_HAL "halrvv;${OpenCV_HAL}")
ocv_debug_message(STATUS "Enable RVV HAL acceleration")
if(NOT ";${OpenCV_HAL};" MATCHES ";rvvhal;")
set(OpenCV_HAL "rvvhal;${OpenCV_HAL}")
endif()
endif()
@ -955,13 +955,13 @@ foreach(hal ${OpenCV_HAL})
else()
message(STATUS "NDSRVP: Andes GNU Toolchain DSP extension is not enabled, disabling ndsrvp...")
endif()
elseif(hal STREQUAL "halrvv")
elseif(hal STREQUAL "rvvhal")
if(";${CPU_BASELINE_FINAL};" MATCHES ";RVV;")
add_subdirectory(hal/riscv-rvv)
ocv_hal_register(RVV_HAL_LIBRARIES RVV_HAL_HEADERS RVV_HAL_INCLUDE_DIRS)
list(APPEND OpenCV_USED_HAL "HAL RVV (ver ${RVV_HAL_VERSION})")
list(APPEND OpenCV_USED_HAL "RVV HAL (ver ${RVV_HAL_VERSION})")
else()
message(STATUS "HAL RVV: RVV is not available, disabling halrvv...")
message(STATUS "RVV HAL: RVV is not available, disabling RVV HAL...")
endif()
elseif(hal STREQUAL "ipp")
add_subdirectory(hal/ipp)

View File

@ -0,0 +1,63 @@
# Builds a JSON-style dictionary string of preprocessor definitions that is fed
# to the bindings generator's header parser, so the parser sees the same
# feature/version macros the C++ build does.
#
# Arguments:
#   opencv_modules  - NAME of a list variable holding OpenCV module names; each
#                     module with HAVE_<module> set contributes a HAVE_<MODULE>
#                     entry (upper-cased)
#   output_variable - NAME of the variable that receives the resulting string
#                     in the caller's scope
function(ocv_bindings_generator_populate_preprocessor_definitions
opencv_modules
output_variable)
# Seed with the first entry (no leading comma); every later entry is appended
# as ",\n\"name\": value" by the helper macro below, keeping the string a
# valid JSON object body.
set(defs "\"CV_VERSION_MAJOR\": ${OPENCV_VERSION_MAJOR}")
# Appends one "name": value pair to `defs` (macro so it mutates the
# function-scope variable directly).
macro(ocv_add_definition name value)
set(defs "${defs},\n\"${name}\": ${value}")
endmacro()
ocv_add_definition(CV_VERSION_MINOR ${OPENCV_VERSION_MINOR})
ocv_add_definition(CV_VERSION_PATCH ${OPENCV_VERSION_PATCH})
ocv_add_definition(OPENCV_ABI_COMPATIBILITY "${OPENCV_VERSION_MAJOR}00")
# One HAVE_<MODULE> entry per enabled module from the caller-supplied list.
foreach(module IN LISTS ${opencv_modules})
if(HAVE_${module})
string(TOUPPER "${module}" module)
ocv_add_definition("HAVE_${module}" 1)
endif()
endforeach()
if(HAVE_EIGEN)
ocv_add_definition(HAVE_EIGEN 1)
ocv_add_definition(EIGEN_WORLD_VERSION ${EIGEN_WORLD_VERSION})
ocv_add_definition(EIGEN_MAJOR_VERSION ${EIGEN_MAJOR_VERSION})
ocv_add_definition(EIGEN_MINOR_VERSION ${EIGEN_MINOR_VERSION})
else()
# Some checks in parsed headers might not be protected with HAVE_EIGEN check
ocv_add_definition(EIGEN_WORLD_VERSION 0)
ocv_add_definition(EIGEN_MAJOR_VERSION 0)
ocv_add_definition(EIGEN_MINOR_VERSION 0)
endif()
if(HAVE_LAPACK)
ocv_add_definition(HAVE_LAPACK 1)
endif()
if(OPENCV_DISABLE_FILESYSTEM_SUPPORT)
ocv_add_definition(OPENCV_HAVE_FILESYSTEM_SUPPORT 0)
else()
ocv_add_definition(OPENCV_HAVE_FILESYSTEM_SUPPORT 1)
endif()
# Lets parsed headers detect they are being read by the bindings parser.
ocv_add_definition(OPENCV_BINDINGS_PARSER 1)
# Implementation details definitions, having no impact on how bindings are
# generated, so their real values can be safely ignored
ocv_add_definition(CV_ENABLE_UNROLLED 0)
ocv_add_definition(CV__EXCEPTION_PTR 0)
ocv_add_definition(CV_NEON 0)
ocv_add_definition(TBB_INTERFACE_VERSION 0)
ocv_add_definition(CV_SSE2 0)
ocv_add_definition(CV_VSX 0)
ocv_add_definition(OPENCV_SUPPORTS_FP_DENORMALS_HINT 0)
ocv_add_definition(CV_LOG_STRIP_LEVEL 0)
ocv_add_definition(CV_LOG_LEVEL_SILENT 0)
ocv_add_definition(CV_LOG_LEVEL_FATAL 1)
ocv_add_definition(CV_LOG_LEVEL_ERROR 2)
ocv_add_definition(CV_LOG_LEVEL_WARN 3)
ocv_add_definition(CV_LOG_LEVEL_INFO 4)
ocv_add_definition(CV_LOG_LEVEL_DEBUG 5)
ocv_add_definition(CV_LOG_LEVEL_VERBOSE 6)
ocv_add_definition(CERES_FOUND 0)
# Return the assembled string to the caller.
set(${output_variable} ${defs} PARENT_SCOPE)
endfunction()

View File

@ -297,6 +297,9 @@ if(WITH_SPNG)
else()
if(PkgConfig_FOUND)
pkg_check_modules(SPNG QUIET spng)
if(SPNG_FOUND)
set(SPNG_LIBRARY ${SPNG_LIBRARIES} CACHE INTERNAL "")
endif()
endif()
endif()
if(SPNG_FOUND)

View File

@ -197,11 +197,13 @@ if(WITH_FASTCV)
ocv_install_3rdparty_licenses(FastCV "${OpenCV_BINARY_DIR}/3rdparty/fastcv/LICENSE")
add_library(fastcv STATIC IMPORTED)
set_target_properties(fastcv PROPERTIES
IMPORTED_LINK_INTERFACE_LIBRARIES ""
IMPORTED_LINK_INTERFACE_LIBRARIES "dl"
IMPORTED_LOCATION "${FastCV_LIB_PATH}/libfastcv.a"
)
if (NOT BUILD_SHARED_LIBS)
install(FILES "${FastCV_LIB_PATH}/libfastcv.a" DESTINATION "${OPENCV_3P_LIB_INSTALL_PATH}" COMPONENT "dev")
set(FASTCV_LOCATION_PATH "${FastCV_LIB_PATH}/libfastcv.a" CACHE INTERNAL "" FORCE)
set(FASTCV_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_3P_LIB_INSTALL_PATH}/libfastcv.a" CACHE INTERNAL "" FORCE)
endif()
set(FASTCV_LIBRARY "fastcv" CACHE PATH "FastCV library")
list(APPEND OPENCV_LINKER_LIBS ${FASTCV_LIBRARY})

View File

@ -68,6 +68,14 @@ else()
set(USE_IPPIW FALSE)
endif()
if(TARGET fastcv AND NOT BUILD_SHARED_LIBS)
file(RELATIVE_PATH FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_BINARY_DIR}" "${FASTCV_LOCATION_PATH}")
ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-FastCV.cmake.in" FASTCV_CONFIGCMAKE @ONLY)
set(USE_FASTCV TRUE)
else()
set(USE_FASTCV FALSE)
endif()
ocv_cmake_hook(PRE_CMAKE_CONFIG_BUILD)
configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig.cmake.in" "${CMAKE_BINARY_DIR}/OpenCVConfig.cmake" @ONLY)
#support for version checking when finding opencv. find_package(OpenCV 2.3.1 EXACT) should now work.
@ -92,6 +100,11 @@ if(USE_IPPIW)
ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-IPPIW.cmake.in" IPPIW_CONFIGCMAKE @ONLY)
endif()
if(USE_FASTCV)
file(RELATIVE_PATH FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}" "${FASTCV_INSTALL_PATH}")
ocv_cmake_configure("${CMAKE_CURRENT_LIST_DIR}/templates/OpenCVConfig-FastCV.cmake.in" FASTCV_CONFIGCMAKE @ONLY)
endif()
function(ocv_gen_config TMP_DIR NESTED_PATH ROOT_NAME)
ocv_path_join(__install_nested "${OPENCV_CONFIG_INSTALL_PATH}" "${NESTED_PATH}")
ocv_path_join(__tmp_nested "${TMP_DIR}" "${NESTED_PATH}")

View File

@ -12,7 +12,7 @@ if(NOT OPENCV_CUSTOM_PACKAGE_INFO)
"OpenCV (Open Source Computer Vision Library) is an open source computer vision
and machine learning software library. OpenCV was built to provide a common
infrastructure for computer vision applications and to accelerate the use of
machine perception in the commercial products. Being a BSD-licensed product,
machine perception in commercial products. Being an Apache 2.0-licensed product,
OpenCV makes it easy for businesses to utilize and modify the code.")
set(CPACK_PACKAGE_VENDOR "OpenCV Foundation")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")

View File

@ -0,0 +1,7 @@
# Recreate the imported `fastcv` target for consumers of the installed OpenCV
# package (install-tree counterpart of the build-tree definition).
if(NOT TARGET fastcv)
  add_library(fastcv STATIC IMPORTED)
  # NOTE(review): the build-tree imported target declares
  # IMPORTED_LINK_INTERFACE_LIBRARIES "dl"; mirror that here so static
  # consumers of the installed config get the same link interface instead of
  # an empty one.
  set_target_properties(fastcv PROPERTIES
    IMPORTED_LINK_INTERFACE_LIBRARIES "dl"
    IMPORTED_LOCATION "${OpenCV_INSTALL_PATH}/@FASTCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE@"
  )
endif()

View File

@ -99,6 +99,8 @@ endif()
@IPPICV_CONFIGCMAKE@
@IPPIW_CONFIGCMAKE@
@FASTCV_CONFIGCMAKE@
# Some additional settings are required if OpenCV is built as static libs
set(OpenCV_SHARED @BUILD_SHARED_LIBS@)

View File

@ -26,3 +26,14 @@ There are 2 approaches how to get OpenCV:
- Build OpenCV from source code against specific version of OpenVINO. This approach solves the limitations mentioned above.
The instruction how to follow both approaches is provided in [OpenCV wiki](https://github.com/opencv/opencv/wiki/BuildOpenCV4OpenVINO).
## Supported targets
OpenVINO backend (DNN_BACKEND_INFERENCE_ENGINE) supports the following [targets](https://docs.opencv.org/4.x/d6/d0f/group__dnn.html#ga709af7692ba29788182cf573531b0ff5):
- **DNN_TARGET_CPU:** Runs on the CPU, no additional dependencies required.
- **DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16:** Runs on the iGPU, requires OpenCL drivers. Install [intel-opencl-icd](https://launchpad.net/ubuntu/jammy/+package/intel-opencl-icd) on Ubuntu.
- **DNN_TARGET_MYRIAD:** Runs on Intel® VPU like the [Neural Compute Stick](https://www.intel.com/content/www/us/en/products/sku/140109/intel-neural-compute-stick-2/specifications.html), to set up [see](https://www.intel.com/content/www/us/en/developer/archive/tools/neural-compute-stick.html).
- **DNN_TARGET_HDDL:** Runs on the Intel® Movidius™ Myriad™ X High Density Deep Learning VPU, for details [see](https://intelsmartedge.github.io/ido-specs/doc/building-blocks/enhanced-platform-awareness/smartedge-open_hddl/).
- **DNN_TARGET_FPGA:** Runs on Intel® Altera® series FPGAs [see](https://www.intel.com/content/www/us/en/docs/programmable/768970/2025-1/getting-started-guide.html).
- **DNN_TARGET_NPU:** Runs on the integrated Intel® AI Boost processor, requires [Linux drivers](https://github.com/intel/linux-npu-driver/releases/tag/v1.17.0) OR [Windows drivers](https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html).

View File

@ -41,7 +41,7 @@ Assuming that we have successfully trained YOLOX model, the subsequent step invo
running this model with OpenCV. There are several critical considerations to address before
proceeding with this process. Let's delve into these aspects.
### YOLO's Pre-proccessing & Output
### YOLO's Pre-processing & Output
Understanding the nature of inputs and outputs associated with YOLO family detectors is pivotal.
These detectors, akin to most Deep Neural Networks (DNN), typically exhibit variation in input

View File

@ -144,9 +144,9 @@ HAL and Extension list of APIs
| |pyrUp & pyrDown |fcvPyramidCreateu8_v4 |
| |cvtColor |fcvColorRGB888toYCrCbu8_v3 |
| | |fcvColorRGB888ToHSV888u8 |
| |GaussianBlur |fcvFilterGaussian5x5u8_v3 |
| |gaussianBlur |fcvFilterGaussian5x5u8_v3 |
| | |fcvFilterGaussian3x3u8_v4 |
| |cvWarpPerspective |fcvWarpPerspectiveu8_v5 |
| |warpPerspective |fcvWarpPerspectiveu8_v5 |
| |Canny |fcvFilterCannyu8 |
| | | |
|CORE |lut | fcvTableLookupu8 |
@ -166,6 +166,7 @@ HAL and Extension list of APIs
| | |fcvElementMultiplyf32 |
| |addWeighted |fcvAddWeightedu8_v2 |
| |subtract |fcvImageDiffu8f32_v2 |
| |SVD & solve |fcvSVDf32_v2 |
**FastCV based OpenCV Extensions APIs list :**
@ -221,10 +222,10 @@ HAL and Extension list of APIs
| |fcvFilterCorrSep17x17s16_v2 |
| |fcvFilterCorrSepNxNs16 |
|sobel3x3u8 |fcvImageGradientSobelPlanars8_v2 |
|sobel3x3u9 |fcvImageGradientSobelPlanars16_v2 |
|sobel3x3u10 |fcvImageGradientSobelPlanars16_v3 |
|sobel3x3u11 |fcvImageGradientSobelPlanarf32_v2 |
|sobel3x3u12 |fcvImageGradientSobelPlanarf32_v3 |
|sobel3x3u8 |fcvImageGradientSobelPlanars16_v2 |
|sobel3x3u8 |fcvImageGradientSobelPlanars16_v3 |
|sobel3x3u8 |fcvImageGradientSobelPlanarf32_v2 |
|sobel3x3u8 |fcvImageGradientSobelPlanarf32_v3 |
|sobel |fcvFilterSobel3x3u8_v2 |
| |fcvFilterSobel3x3u8s16 |
| |fcvFilterSobel5x5u8s16 |
@ -244,3 +245,4 @@ HAL and Extension list of APIs
|trackOpticalFlowLK |fcvTrackLKOpticalFlowu8_v3 |
| |fcvTrackLKOpticalFlowu8 |
|warpPerspective2Plane |fcv2PlaneWarpPerspectiveu8 |
|warpPerspective |fcvWarpPerspectiveu8_v5 |

View File

@ -1040,7 +1040,7 @@ namespace CAROTENE_NS {
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity);
/*
Among each pixel `p` within `src` find min and max values and its first occurences
Among each pixel `p` within `src` find min and max values and its first occurrences
*/
void minMaxLoc(const Size2D &size,
const s8 * srcBase, ptrdiff_t srcStride,

View File

@ -13,6 +13,7 @@ add_library(ipphal STATIC
"${CMAKE_CURRENT_SOURCE_DIR}/src/norm_ipp.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/cart_polar_ipp.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/transforms_ipp.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/sum_ipp.cpp"
)
#TODO: HAVE_IPP_ICV and HAVE_IPP_IW added as private macro till OpenCV itself is

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef __IPP_HAL_CORE_HPP__
#define __IPP_HAL_CORE_HPP__
@ -32,6 +36,11 @@ int ipp_hal_normDiff(const uchar* src1, size_t src1_step, const uchar* src2, siz
#undef cv_hal_normDiff
#define cv_hal_normDiff ipp_hal_normDiff
int ipp_hal_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result);
#undef cv_hal_sum
#define cv_hal_sum ipp_hal_sum
#endif
int ipp_hal_polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees);
@ -56,4 +65,6 @@ int ipp_hal_transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data,
#undef cv_hal_transpose2d
#define cv_hal_transpose2d ipp_hal_transpose2d
//! @endcond
#endif

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef __IPP_HAL_UTILS_HPP__
#define __IPP_HAL_UTILS_HPP__

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core/core.hpp>

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core.hpp>

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core.hpp>

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core.hpp>

59
hal/ipp/src/sum_ipp.cpp Normal file
View File

@ -0,0 +1,59 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core.hpp>
#include <opencv2/core/base.hpp>
#if IPP_VERSION_X100 >= 700
// HAL replacement for cv::sum() backed by Intel IPP.
// Computes the per-channel sum of the image in `src_data` and writes it to
// `result` (one double per channel). Returns CV_HAL_ERROR_OK on success, or
// CV_HAL_ERROR_NOT_IMPLEMENTED so the caller falls back to OpenCV's generic
// implementation.
int ipp_hal_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result)
{
int cn = CV_MAT_CN(src_type);
// IPP only provides C1/C3/C4 variants; reject higher channel counts up
// front (2-channel types fall through and are rejected by the dispatch
// tables below, which yield null pointers for them).
if (cn > 4)
{
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
IppiSize sz = { width, height };
// Two IPP signature families: the 32f variants take an accuracy hint,
// the integer variants do not.
typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
// Hint-taking (float) variant matching src_type, or null.
ippiSumFuncHint ippiSumHint =
src_type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
src_type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
src_type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
0;
// No-hint (8u/16u/16s) variant matching src_type, or null.
ippiSumFuncNoHint ippiSum =
src_type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
src_type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
src_type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
src_type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
src_type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
src_type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
src_type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
src_type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
src_type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
0;
if( ippiSumHint || ippiSum )
{
// At most one of the two pointers is non-null (the type sets are
// disjoint); dispatch through the instrumentation wrapper.
IppStatus ret = ippiSumHint ?
CV_INSTRUMENT_FUN_IPP(ippiSumHint, src_data, (int)src_step, sz, result, ippAlgHintAccurate) :
CV_INSTRUMENT_FUN_IPP(ippiSum, src_data, (int)src_step, sz, result);
if( ret >= 0 )
{
return CV_HAL_ERROR_OK;
}
else
{
// IPP call failed at runtime -> defer to the generic OpenCV path.
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}
// No IPP variant exists for this src_type.
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
#endif

View File

@ -1,3 +1,7 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "ipp_hal_core.hpp"
#include <opencv2/core.hpp>

View File

@ -2,6 +2,7 @@ project(kleidicv_hal)
if(HAVE_KLEIDICV)
option(KLEIDICV_ENABLE_SME2 "" OFF) # not compatible with some CLang versions in NDK
option(KLEIDICV_USE_CV_NAMESPACE_IN_OPENCV_HAL "" OFF)
include("${KLEIDICV_SOURCE_PATH}/adapters/opencv/CMakeLists.txt")
# HACK to suppress adapters/opencv/kleidicv_hal.cpp:343:12: warning: unused function 'from_opencv' [-Wunused-function]
target_compile_options( kleidicv_hal PRIVATE

View File

@ -156,10 +156,12 @@ int bilateralFilter(const uchar* src_data, size_t src_step,
int i, j, maxk, radius;
if( sigma_color <= 0 )
sigma_color = 1;
if( sigma_space <= 0 )
sigma_space = 1;
constexpr double eps = 1e-6;
if( sigma_color <= eps || sigma_space <= eps )
{
src.copyTo(dst);
return CV_HAL_ERROR_OK;
}
double gauss_color_coeff = -0.5/(sigma_color * sigma_color);
double gauss_space_coeff = -0.5/(sigma_space * sigma_space);

View File

@ -1,9 +1,26 @@
cmake_minimum_required(VERSION ${MIN_VER_CMAKE} FATAL_ERROR)
set(HAL_LIB_NAME "")
set(RVV_HAL_INCLUDE_DIR include)
set(RVV_HAL_SOURCE_DIR src)
file(GLOB rvv_hal_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_INCLUDE_DIR}/*.hpp")
file(GLOB rvv_hal_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${RVV_HAL_SOURCE_DIR}/**/*.cpp")
set(HAL_LIB_NAME "rvv_hal")
add_library(${HAL_LIB_NAME} STATIC)
target_sources(${HAL_LIB_NAME} PRIVATE ${rvv_hal_headers} ${rvv_hal_sources})
set_target_properties(${HAL_LIB_NAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH})
if(NOT BUILD_SHARED_LIBS)
ocv_install_target(${HAL_LIB_NAME} EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
endif()
target_include_directories(${HAL_LIB_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_SOURCE_DIR}/modules/core/include
${CMAKE_SOURCE_DIR}/modules/imgproc/include) # ${CMAKE_SOURCE_DIR}/modules/features2d/include
set(RVV_HAL_FOUND TRUE CACHE INTERNAL "")
set(RVV_HAL_VERSION "0.0.1" CACHE INTERNAL "")
set(RVV_HAL_LIBRARIES ${HAL_LIB_NAME} CACHE INTERNAL "")
set(RVV_HAL_HEADERS "hal_rvv.hpp" CACHE INTERNAL "")
set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_SOURCE_DIR}/modules/imgproc/include" CACHE INTERNAL "")
set(RVV_HAL_HEADERS "rvv_hal.hpp" CACHE INTERNAL "")
set(RVV_HAL_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}" CACHE INTERNAL "")

View File

@ -1,65 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_HPP_INCLUDED
#define OPENCV_HAL_RVV_HPP_INCLUDED
#include "opencv2/core/base.hpp"
#include "opencv2/core/hal/interface.h"
#include "opencv2/imgproc/hal/interface.h"
#ifndef CV_HAL_RVV_071_ENABLED
# if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__) && defined(__riscv_v) && __riscv_v == 7000
# define CV_HAL_RVV_071_ENABLED 1
# else
# define CV_HAL_RVV_071_ENABLED 0
# endif
#endif
#if CV_HAL_RVV_071_ENABLED
#include "version/hal_rvv_071.hpp"
#endif
#if defined(__riscv_v) && __riscv_v == 1000000
#include "hal_rvv_1p0/types.hpp"
#include "hal_rvv_1p0/merge.hpp" // core
#include "hal_rvv_1p0/mean.hpp" // core
#include "hal_rvv_1p0/dxt.hpp" // core
#include "hal_rvv_1p0/norm.hpp" // core
#include "hal_rvv_1p0/norm_diff.hpp" // core
#include "hal_rvv_1p0/norm_hamming.hpp" // core
#include "hal_rvv_1p0/convert_scale.hpp" // core
#include "hal_rvv_1p0/minmax.hpp" // core
#include "hal_rvv_1p0/atan.hpp" // core
#include "hal_rvv_1p0/split.hpp" // core
#include "hal_rvv_1p0/magnitude.hpp" // core
#include "hal_rvv_1p0/cart_to_polar.hpp" // core
#include "hal_rvv_1p0/polar_to_cart.hpp" // core
#include "hal_rvv_1p0/flip.hpp" // core
#include "hal_rvv_1p0/lut.hpp" // core
#include "hal_rvv_1p0/exp.hpp" // core
#include "hal_rvv_1p0/log.hpp" // core
#include "hal_rvv_1p0/lu.hpp" // core
#include "hal_rvv_1p0/cholesky.hpp" // core
#include "hal_rvv_1p0/qr.hpp" // core
#include "hal_rvv_1p0/svd.hpp" // core
#include "hal_rvv_1p0/sqrt.hpp" // core
#include "hal_rvv_1p0/copy_mask.hpp" // core
#include "hal_rvv_1p0/div.hpp" // core
#include "hal_rvv_1p0/dotprod.hpp" // core
#include "hal_rvv_1p0/compare.hpp" // core
#include "hal_rvv_1p0/transpose.hpp" // core
#include "hal_rvv_1p0/moments.hpp" // imgproc
#include "hal_rvv_1p0/filter.hpp" // imgproc
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
#include "hal_rvv_1p0/color.hpp" // imgproc
#include "hal_rvv_1p0/warp.hpp" // imgproc
#include "hal_rvv_1p0/thresh.hpp" // imgproc
#include "hal_rvv_1p0/histogram.hpp" // imgproc
#include "hal_rvv_1p0/resize.hpp" // imgproc
#include "hal_rvv_1p0/integral.hpp" // imgproc
#endif
#endif

View File

@ -1,128 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_ATAN_HPP_INCLUDED
#define OPENCV_HAL_RVV_ATAN_HPP_INCLUDED
#undef cv_hal_fastAtan32f
#define cv_hal_fastAtan32f cv::cv_hal_rvv::fast_atan_32
#undef cv_hal_fastAtan64f
#define cv_hal_fastAtan64f cv::cv_hal_rvv::fast_atan_64
#include <riscv_vector.h>
#include <cfloat>
namespace cv { namespace cv_hal_rvv {
namespace detail {
// ref: mathfuncs_core.simd.hpp
static constexpr float pi = CV_PI;
struct AtanParams
{
float p1, p3, p5, p7, angle_90;
};
static constexpr AtanParams atan_params_rad {
0.9997878412794807F,
-0.3258083974640975F,
0.1555786518463281F,
-0.04432655554792128F,
90.F * (pi / 180.F)};
static constexpr AtanParams atan_params_deg {
atan_params_rad.p1 * (180 / pi),
atan_params_rad.p3 * (180 / pi),
atan_params_rad.p5 * (180 / pi),
atan_params_rad.p7 * (180 / pi),
90.F};
template <typename VEC_T>
__attribute__((always_inline)) inline VEC_T
rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params)
{
const auto ax = __riscv_vfabs(vx, vl);
const auto ay = __riscv_vfabs(vy, vl);
// Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar.
const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl),
__riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl),
vl);
const auto c2 = __riscv_vfmul(c, c, vl);
// Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional
// M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased
// from 5.952ms to 5.805ms on Muse Pi)
// Additionally, when registers are nearly fully utilized (though not yet exhausted), the
// compiler is likely to fail to optimize and may introduce slower memory access (e.g., in
// cv::cv_hal_rvv::fast_atan_64).
// Saving registers can also make this function more reusable in other contexts.
// Therefore, vfmadd is not used here.
auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl);
a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl);
a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl);
a = __riscv_vfmul(a, c, vl);
a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl);
a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl);
a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl);
return a;
}
} // namespace detail
inline int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg)
{
auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad;
for (size_t vl = 0; n > 0; n -= vl)
{
vl = __riscv_vsetvl_e32m4(n);
auto vy = __riscv_vle32_v_f32m4(y, vl);
auto vx = __riscv_vle32_v_f32m4(x, vl);
auto a = detail::rvv_atan(vy, vx, vl, atan_params);
__riscv_vse32(dst, a, vl);
x += vl;
y += vl;
dst += vl;
}
return CV_HAL_ERROR_OK;
}
inline int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg)
{
// this also uses float32 version, ref: mathfuncs_core.simd.hpp
auto atan_params = angle_in_deg ? detail::atan_params_deg : detail::atan_params_rad;
for (size_t vl = 0; n > 0; n -= vl)
{
vl = __riscv_vsetvl_e64m8(n);
auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl);
auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl);
auto a = detail::rvv_atan(vy, vx, vl, atan_params);
__riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl);
x += vl;
y += vl;
dst += vl;
}
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif //OPENCV_HAL_RVV_ATAN_HPP_INCLUDED

View File

@ -1,52 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMMON_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace custom_intrin {
#define CV_HAL_RVV_NOOP(a) (a)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
_Tpvs mask = __riscv_vsra(v, shift, vl); \
_Tpvs v_xor = __riscv_vxor(v, mask, vl); \
return __riscv_vreinterpret_##suffix( \
__riscv_vsub(v_xor, mask, vl) \
); \
}
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \
inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \
return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \
}
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
}}} // cv::cv_hal_rvv::custom_intrin
#endif

View File

@ -1,268 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_DIV_HPP_INCLUDED
#define OPENCV_HAL_RVV_DIV_HPP_INCLUDED
#include <riscv_vector.h>
#include <limits>
namespace cv { namespace cv_hal_rvv { namespace div {
namespace {
inline size_t setvl(int l) { return __riscv_vsetvl_e8m2(l); }
inline vuint8m2_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m2(p, vl); }
inline vint8m2_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m2(p, vl); }
inline vuint16m4_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m4(p, vl); }
inline vint16m4_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m4(p, vl); }
inline vint32m8_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m8(p, vl); }
inline vfloat32m8_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m8(p, vl); }
inline void vse(uint8_t *p, const vuint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
inline void vse(int8_t *p, const vint8m2_t &v, int vl) { __riscv_vse8(p, v, vl); }
inline void vse(uint16_t *p, const vuint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
inline void vse(int16_t *p, const vint16m4_t &v, int vl) { __riscv_vse16(p, v, vl); }
inline void vse(int *p, const vint32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
inline void vse(float *p, const vfloat32m8_t &v, int vl) { __riscv_vse32(p, v, vl); }
inline vuint16m4_t ext(const vuint8m2_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
inline vint16m4_t ext(const vint8m2_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
inline vuint32m8_t ext(const vuint16m4_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
inline vint32m8_t ext(const vint16m4_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }
inline vuint8m2_t nclip(const vuint16m4_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
inline vint8m2_t nclip(const vint16m4_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
inline vuint16m4_t nclip(const vuint32m8_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
inline vint16m4_t nclip(const vint32m8_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
template <typename VT> inline
VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) {
return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl);
}
// Base case for signed 32-bit lanes: compute v1 * (scale / v2) in float,
// then convert back to integers (rounding per the current FP rounding mode).
template <> inline
vint32m8_t div_sat(const vint32m8_t &v1, const vint32m8_t &v2, const float scale, const int vl) {
    auto num = __riscv_vfcvt_f(v1, vl);
    // vfrdiv yields scale / den, so one multiply finishes scale * v1 / v2.
    auto scaled_inv_den = __riscv_vfrdiv(__riscv_vfcvt_f(v2, vl), scale, vl);
    return __riscv_vfcvt_x(__riscv_vfmul(num, scaled_inv_den, vl), vl);
}
// Base case for unsigned 32-bit lanes: same as the signed variant but the
// final conversion targets an unsigned integer vector.
template <> inline
vuint32m8_t div_sat(const vuint32m8_t &v1, const vuint32m8_t &v2, const float scale, const int vl) {
    auto num = __riscv_vfcvt_f(v1, vl);
    // vfrdiv yields scale / den, so one multiply finishes scale * v1 / v2.
    auto scaled_inv_den = __riscv_vfrdiv(__riscv_vfcvt_f(v2, vl), scale, vl);
    return __riscv_vfcvt_xu(__riscv_vfmul(num, scaled_inv_den, vl), vl);
}
// Saturating scaled reciprocal for narrow lanes: widen, recurse into the
// wider-type overload, then narrow back with saturation.
template <typename VT> inline
VT recip_sat(const VT &v, const float scale, const int vl) {
    auto widened = ext(v, vl);
    auto wide_result = recip_sat(widened, scale, vl);
    return nclip(wide_result, vl);
}
// Base case for signed 32-bit lanes: scale / v computed in float via vfrdiv,
// converted back to integers.
template <> inline
vint32m8_t recip_sat(const vint32m8_t &v, const float scale, const int vl) {
    return __riscv_vfcvt_x(__riscv_vfrdiv(__riscv_vfcvt_f(v, vl), scale, vl), vl);
}
// Base case for unsigned 32-bit lanes: scale / v computed in float via vfrdiv,
// converted back to unsigned integers.
template <> inline
vuint32m8_t recip_sat(const vuint32m8_t &v, const float scale, const int vl) {
    return __riscv_vfcvt_xu(__riscv_vfrdiv(__riscv_vfcvt_f(v, vl), scale, vl), vl);
}
} // anonymous
#undef cv_hal_div8u
#define cv_hal_div8u cv::cv_hal_rvv::div::div<uint8_t>
#undef cv_hal_div8s
#define cv_hal_div8s cv::cv_hal_rvv::div::div<int8_t>
#undef cv_hal_div16u
#define cv_hal_div16u cv::cv_hal_rvv::div::div<uint16_t>
#undef cv_hal_div16s
#define cv_hal_div16s cv::cv_hal_rvv::div::div<int16_t>
#undef cv_hal_div32s
#define cv_hal_div32s cv::cv_hal_rvv::div::div<int>
#undef cv_hal_div32f
#define cv_hal_div32f cv::cv_hal_rvv::div::div<float>
// #undef cv_hal_div64f
// #define cv_hal_div64f cv::cv_hal_rvv::div::div<double>
// Element-wise scaled division over a 2-D array: dst = saturate(scale * src1 / src2),
// with dst forced to 0 wherever src2 == 0 (OpenCV's divide-by-zero convention).
// step1/step2/step are row strides in BYTES.
template <typename ST> inline
int div(const ST *src1, size_t step1, const ST *src2, size_t step2,
        ST *dst, size_t step, int width, int height, float scale) {
    // Fast path: when |scale * MAX| < 1 (or scale == 0) the whole output is
    // treated as zero. NOTE(review): relies on && binding tighter than ||,
    // and quotients in [0.5, 1) would round to 1 in the vector path below —
    // assumed to match the scalar OpenCV convention; confirm.
    if (scale == 0.f ||
        (scale * static_cast<float>(std::numeric_limits<ST>::max())) < 1.f &&
        (scale * static_cast<float>(std::numeric_limits<ST>::max())) > -1.f) {
        for (int h = 0; h < height; h++) {
            ST *dst_h = reinterpret_cast<ST*>((uchar*)dst + h * step);
            std::memset(dst_h, 0, sizeof(ST) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    for (int h = 0; h < height; h++) {
        // Strides are in bytes, hence the uchar* arithmetic per row.
        const ST *src1_h = reinterpret_cast<const ST*>((const uchar*)src1 + h * step1);
        const ST *src2_h = reinterpret_cast<const ST*>((const uchar*)src2 + h * step2);
        ST *dst_h = reinterpret_cast<ST*>((uchar*)dst + h * step);
        int vl;
        // Strip-mined loop: setvl picks how many lanes this iteration handles.
        for (int w = 0; w < width; w += vl) {
            vl = setvl(width - w);
            auto v1 = vle(src1_h + w, vl);
            auto v2 = vle(src2_h + w, vl);
            // Lanes with a zero divisor are overwritten with 0 by the merge.
            auto mask = __riscv_vmseq(v2, 0, vl);
            vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// float specialization: dst = scale * src1 / src2 with no saturation and no
// divisor-zero masking — IEEE inf/NaN propagate as-is.
template <> inline
int div(const float *src1, size_t step1, const float *src2, size_t step2,
        float *dst, size_t step, int width, int height, float scale) {
    // scale == 0 short-circuits to an all-zero output.
    if (scale == 0.f) {
        for (int h = 0; h < height; h++) {
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            std::memset(dst_h, 0, sizeof(float) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    if (std::fabs(scale - 1.f) < FLT_EPSILON) {
        // scale == 1 path: dst = src1 * (1 / src2), via vfrdiv(v2, 1.f).
        for (int h = 0; h < height; h++) {
            const float *src1_h = reinterpret_cast<const float*>((const uchar*)src1 + h * step1);
            const float *src2_h = reinterpret_cast<const float*>((const uchar*)src2 + h * step2);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, 1.f, vl), vl), vl);
            }
        }
    } else {
        // General path: dst = src1 * (scale / src2).
        for (int h = 0; h < height; h++) {
            const float *src1_h = reinterpret_cast<const float*>((const uchar*)src1 + h * step1);
            const float *src2_h = reinterpret_cast<const float*>((const uchar*)src2 + h * step2);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfrdiv(v2, scale, vl), vl), vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
#undef cv_hal_recip8u
#define cv_hal_recip8u cv::cv_hal_rvv::div::recip<uint8_t>
#undef cv_hal_recip8s
#define cv_hal_recip8s cv::cv_hal_rvv::div::recip<int8_t>
#undef cv_hal_recip16u
#define cv_hal_recip16u cv::cv_hal_rvv::div::recip<uint16_t>
#undef cv_hal_recip16s
#define cv_hal_recip16s cv::cv_hal_rvv::div::recip<int16_t>
#undef cv_hal_recip32s
#define cv_hal_recip32s cv::cv_hal_rvv::div::recip<int>
#undef cv_hal_recip32f
#define cv_hal_recip32f cv::cv_hal_rvv::div::recip<float>
// #undef cv_hal_recip64f
// #define cv_hal_recip64f cv::cv_hal_rvv::div::recip<double>
// Element-wise scaled reciprocal: dst = saturate(scale / src), with dst forced
// to 0 wherever src == 0. src_step/dst_step are row strides in BYTES.
template <typename ST> inline
int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step,
          int width, int height, float scale) {
    // |scale| < 1 (or scale == 0) is treated as an all-zero result: for any
    // nonzero integer src, |scale / src| <= |scale|.
    // NOTE(review): quotients in [0.5, 1) would round to 1 in the vector path;
    // assumed to match the scalar OpenCV convention — confirm. Relies on &&
    // binding tighter than ||.
    if (scale == 0.f || scale < 1.f && scale > -1.f) {
        for (int h = 0; h < height; h++) {
            ST *dst_h = reinterpret_cast<ST*>((uchar*)dst_data + h * dst_step);
            std::memset(dst_h, 0, sizeof(ST) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    for (int h = 0; h < height; h++) {
        const ST *src_h = reinterpret_cast<const ST*>((const uchar*)src_data + h * src_step);
        ST *dst_h = reinterpret_cast<ST*>((uchar*)dst_data + h * dst_step);
        int vl;
        for (int w = 0; w < width; w += vl) {
            vl = setvl(width - w);
            auto v = vle(src_h + w, vl);
            // Lanes where src == 0 are overwritten with 0 by the merge.
            auto mask = __riscv_vmseq(v, 0, vl);
            vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// float specialization: dst = scale / src with no zero masking — IEEE inf/NaN
// propagate as-is.
template <> inline
int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step,
          int width, int height, float scale) {
    // scale == 0 short-circuits to an all-zero output.
    if (scale == 0.f) {
        for (int h = 0; h < height; h++) {
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            std::memset(dst_h, 0, sizeof(float) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    if (std::fabs(scale - 1.f) < FLT_EPSILON) {
        // scale == 1 path: plain reciprocal via vfrdiv(v, 1.f).
        for (int h = 0; h < height; h++) {
            const float *src_h = reinterpret_cast<const float*>((const uchar*)src_data + h * src_step);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v = vle(src_h + w, vl);
                vse(dst_h + w, __riscv_vfrdiv(v, 1.f, vl), vl);
            }
        }
    } else {
        // General path: dst = scale / src in a single vfrdiv.
        for (int h = 0; h < height; h++) {
            const float *src_h = reinterpret_cast<const float*>((const uchar*)src_data + h * src_step);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v = vle(src_h + w, vl);
                vse(dst_h + w, __riscv_vfrdiv(v, scale, vl), vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::div
#endif // OPENCV_HAL_RVV_DIV_HPP_INCLUDED

File diff suppressed because it is too large Load Diff

View File

@ -1,108 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED
#define OPENCV_HAL_RVV_HISTOGRAM_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
namespace equalize_hist {
#undef cv_hal_equalize_hist
#define cv_hal_equalize_hist cv::cv_hal_rvv::equalize_hist::equalize_hist
// Adapter so cv::parallel_for_ can drive a free function that takes extra
// arguments: everything beyond (start, end) is bound at construction via
// std::bind, and the Range supplied by parallel_for_ fills in the first two
// parameters at call time.
class HistogramInvoker : public ParallelLoopBody
{
public:
    template<typename... Args>
    HistogramInvoker(std::function<void(int, int, Args...)> _func, Args&&... args)
    {
        func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
    }
    // Called by parallel_for_ once per row strip.
    virtual void operator()(const Range& range) const override
    {
        func(range.start, range.end);
    }
private:
    // Fully-bound worker: only the row range remains as parameters.
    std::function<void(int, int)> func;
};
constexpr int HIST_SZ = std::numeric_limits<uchar>::max() + 1;
static inline void hist_invoke(int start, int end, const uchar* src_data, size_t src_step, int width, int* hist, std::mutex* m)
{
int h[HIST_SZ] = {0};
for (int i = start; i < end; i++)
{
const uchar* src = src_data + i * src_step;
int j;
for (j = 0; j + 3 < width; j += 4)
{
int t0 = src[j], t1 = src[j+1];
h[t0]++; h[t1]++;
t0 = src[j+2]; t1 = src[j+3];
h[t0]++; h[t1]++;
}
for (; j < width; j++)
{
h[src[j]]++;
}
}
std::lock_guard<std::mutex> lk(*m);
for (int i = 0; i < HIST_SZ; i++)
{
hist[i] += h[i];
}
}
// LUT worker for rows [start, end): dst[i][j] = lut[src[i][j]].
// Uses the RVV indexed (gather) load vloxei8 with the source bytes themselves
// as byte offsets into the table; valid because the LUT is exactly 256 bytes.
static inline void lut_invoke(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, const uchar* lut)
{
    for (int i = start; i < end; i++)
    {
        int vl;
        // Strip-mined over the row width.
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e8m8(width - j);
            auto src = __riscv_vle8_v_u8m8(src_data + i * src_step + j, vl);
            auto dst = __riscv_vloxei8_v_u8m8(lut, src, vl);
            __riscv_vse8(dst_data + i * dst_step + j, dst, vl);
        }
    }
}
// the algorithm is copied from imgproc/src/histogram.cpp,
// in the function void cv::equalizeHist
//
// Histogram equalization: build the 256-bin histogram in parallel, turn its
// cumulative distribution into a LUT, then apply the LUT in parallel.
inline int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height)
{
    int hist[HIST_SZ] = {0};
    uchar lut[HIST_SZ];
    std::mutex m;
    cv::parallel_for_(Range(0, height), HistogramInvoker({hist_invoke}, src_data, src_step, width, reinterpret_cast<int *>(hist), &m), static_cast<double>(width * height) / (1 << 15));
    // i = smallest occupied bin.
    int i = 0;
    while (!hist[i]) ++i;
    if (hist[i] == width * height)
    {
        // Uniform image: the scale below would divide by zero (cv::equalizeHist
        // handles this with dst.setTo(i)). Emit the constant via the LUT path.
        for (int j = 0; j < HIST_SZ; j++)
            lut[j] = (uchar)i;
        cv::parallel_for_(Range(0, height), HistogramInvoker({lut_invoke}, src_data, src_step, dst_data, dst_step, width, reinterpret_cast<const uchar*>(lut)), static_cast<double>(width * height) / (1 << 15));
        return CV_HAL_ERROR_OK;
    }
    // Map the CDF onto [0, 255]; the first occupied bin maps to 0.
    float scale = (HIST_SZ - 1.f)/(width * height - hist[i]);
    int sum = 0;
    for (lut[i++] = 0; i < HIST_SZ; i++)
    {
        sum += hist[i];
        lut[i] = std::min(std::max(static_cast<int>(std::round(sum * scale)), 0), HIST_SZ - 1);
    }
    cv::parallel_for_(Range(0, height), HistogramInvoker({lut_invoke}, src_data, src_step, dst_data, dst_step, width, reinterpret_cast<const uchar*>(lut)), static_cast<double>(width * height) / (1 << 15));
    return CV_HAL_ERROR_OK;
}
} // cv::cv_hal_rvv::equalize_hist
}}
#endif

View File

@ -1,53 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED
#define OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED
#include <riscv_vector.h>
#include "hal_rvv_1p0/sincos.hpp"
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_polarToCart32f
#define cv_hal_polarToCart32f cv::cv_hal_rvv::polarToCart<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_polarToCart64f
#define cv_hal_polarToCart64f cv::cv_hal_rvv::polarToCart<cv::cv_hal_rvv::RVV_F64M8>
// Polar -> Cartesian: x = mag * cos(angle), y = mag * sin(angle); a null mag
// pointer means unit magnitude (sin/cos stored directly).
// The trig is always evaluated in f32 (T = RVV_F32M4) with inputs/outputs cast
// at the loop boundary. NOTE(review): the f64 instantiation therefore carries
// only single-precision sin/cos accuracy — presumably an accepted tradeoff of
// the sincos kernel; confirm.
template <typename RVV_T, typename Elem = typename RVV_T::ElemType>
inline int
polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees)
{
    using T = RVV_F32M4;
    // The degree/radian conversion is folded into the sincos argument scale.
    const auto sincos_scale = angleInDegrees ? detail::sincos_deg_scale : detail::sincos_rad_scale;
    size_t vl;
    // Polynomial coefficients splatted once at maximum vl and reused each strip.
    auto cos_p2 = T::vmv(detail::sincos_cos_p2, T::setvlmax());
    auto cos_p0 = T::vmv(detail::sincos_cos_p0, T::setvlmax());
    for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl)
    {
        vl = RVV_T::setvl(len);
        auto vangle = T::cast(RVV_T::vload(angle, vl), vl);
        T::VecType vsin, vcos;
        detail::SinCos32f<T>(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl);
        if (mag)
        {
            auto vmag = T::cast(RVV_T::vload(mag, vl), vl);
            vsin = __riscv_vfmul(vsin, vmag, vl);
            vcos = __riscv_vfmul(vcos, vmag, vl);
            mag += vl; // mag only advances when present
        }
        RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl);
        RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl);
    }
    return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED

View File

@ -1,131 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
#include <riscv_vector.h>
#include <cmath>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_sqrt32f
#undef cv_hal_sqrt64f
#undef cv_hal_invSqrt32f
#undef cv_hal_invSqrt64f
#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
#ifdef __clang__
// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access.
// So a smaller LMUL is used here.
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M4>>
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M4>>
#else
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
#endif
namespace detail {
// Newton-Raphson method
// Use 4 LMUL registers
// Computes sqrt(x): seeds y ~= 1/sqrt(x) with vfrsqrt7, refines it with
// iter_times Newton steps (y *= 1.5 - 0.5*x*y*y), then returns x * y.
template <size_t iter_times, typename VEC_T>
inline VEC_T sqrt(VEC_T x, size_t vl)
{
    auto x2 = __riscv_vfmul(x, 0.5, vl);
    // ~7-bit-accurate reciprocal square root estimate as the Newton seed.
    auto y = __riscv_vfrsqrt7(x, vl);
#ifdef __clang__
#pragma unroll
#endif
    for (size_t i = 0; i < iter_times; i++)
    {
        auto t = __riscv_vfmul(y, y, vl);
        t = __riscv_vfmul(t, x2, vl);
        t = __riscv_vfrsub(t, 1.5, vl);
        y = __riscv_vfmul(t, y, vl);
    }
    // just to prevent the compiler from calculating mask before the iteration, which will run out
    // of registers and cause memory access.
    asm volatile("" ::: "memory");
    auto classified = __riscv_vfclass(x, vl);
    // block -0, +0, positive subnormal number, +inf
    // (masked-off lanes keep x unchanged; active lanes get x * y = sqrt(x))
    auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
    return __riscv_vfmul_mu(mask, x, x, y, vl);
}
// Newton-Raphson method
// Use 3 LMUL registers and 1 mask register
// Computes 1/sqrt(x): same seed and refinement as sqrt() above, but the Newton
// update itself is masked, so special lanes keep the raw vfrsqrt7 estimate.
template <size_t iter_times, typename VEC_T>
inline VEC_T invSqrt(VEC_T x, size_t vl)
{
    auto classified = __riscv_vfclass(x, vl);
    // block -0, +0, positive subnormal number, +inf
    auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
    auto x2 = __riscv_vfmul(x, 0.5, vl);
    // ~7-bit-accurate reciprocal square root estimate as the Newton seed.
    auto y = __riscv_vfrsqrt7(x, vl);
#ifdef __clang__
#pragma unroll
#endif
    for (size_t i = 0; i < iter_times; i++)
    {
        auto t = __riscv_vfmul(y, y, vl);
        t = __riscv_vfmul(t, x2, vl);
        t = __riscv_vfrsub(t, 1.5, vl);
        // Masked update: only well-behaved lanes are refined.
        y = __riscv_vfmul_mu(mask, y, t, y, vl);
    }
    return y;
}
} // namespace detail
// Iteration-count traits for the Newton kernels above: f32 uses 2 refinement
// steps, f64 uses 3 (presumably what it takes to converge from the 7-bit
// vfrsqrt7 seed at each precision — see detail::sqrt/invSqrt).
template <typename RVV_T>
struct Sqrt32f
{
    using T = RVV_T;
    static constexpr size_t iter_times = 2;
};
// 64-bit counterpart of Sqrt32f.
template <typename RVV_T>
struct Sqrt64f
{
    using T = RVV_T;
    static constexpr size_t iter_times = 3;
};
// Strip-mined driver: element-wise sqrt over a contiguous array using the
// Newton kernel selected by the SQRT_T traits.
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int sqrt(const Elem* src, Elem* dst, int _len)
{
    size_t remaining = _len;
    while (remaining > 0)
    {
        size_t vl = SQRT_T::T::setvl(remaining);
        auto x = SQRT_T::T::vload(src, vl);
        SQRT_T::T::vstore(dst, detail::sqrt<SQRT_T::iter_times>(x, vl), vl);
        remaining -= vl;
        src += vl;
        dst += vl;
    }
    return CV_HAL_ERROR_OK;
}
// Strip-mined driver: element-wise 1/sqrt over a contiguous array using the
// masked Newton kernel selected by the SQRT_T traits.
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int invSqrt(const Elem* src, Elem* dst, int _len)
{
    size_t remaining = _len;
    while (remaining > 0)
    {
        size_t vl = SQRT_T::T::setvl(remaining);
        auto x = SQRT_T::T::vload(src, vl);
        SQRT_T::T::vstore(dst, detail::invSqrt<SQRT_T::iter_times>(x, vl), vl);
        remaining -= vl;
        src += vl;
        dst += vl;
    }
    return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED

View File

@ -0,0 +1,332 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_RVV_HAL_CORE_HPP
#define OPENCV_RVV_HAL_CORE_HPP
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
/* ############ merge ############ */
int merge8u(const uchar** src, uchar* dst, int len, int cn);
int merge16u(const ushort** src, ushort* dst, int len, int cn);
int merge32s(const int** src, int* dst, int len, int cn);
int merge64s(const int64** src, int64* dst, int len, int cn);
#undef cv_hal_merge8u
#define cv_hal_merge8u cv::rvv_hal::core::merge8u
#undef cv_hal_merge16u
#define cv_hal_merge16u cv::rvv_hal::core::merge16u
#undef cv_hal_merge32s
#define cv_hal_merge32s cv::rvv_hal::core::merge32s
#undef cv_hal_merge64s
#define cv_hal_merge64s cv::rvv_hal::core::merge64s
/* ############ meanStdDev ############ */
int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
#undef cv_hal_meanStdDev
#define cv_hal_meanStdDev cv::rvv_hal::core::meanStdDev
/* ############ dft ############ */
int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale,
int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute);
#undef cv_hal_dft
#define cv_hal_dft cv::rvv_hal::core::dft
/* ############ norm ############ */
int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step,
int width, int height, int type, int norm_type, double* result);
#undef cv_hal_norm
#define cv_hal_norm cv::rvv_hal::core::norm
/* ############ normDiff ############ */
int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step,
const uchar* mask, size_t mask_step, int width, int height, int type,
int norm_type, double* result);
#undef cv_hal_normDiff
#define cv_hal_normDiff cv::rvv_hal::core::normDiff
/* ############ normHamming ############ */
int normHamming8u(const uchar* a, int n, int cellSize, int* result);
int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result);
#undef cv_hal_normHamming8u
#define cv_hal_normHamming8u cv::rvv_hal::core::normHamming8u
#undef cv_hal_normHammingDiff8u
#define cv_hal_normHammingDiff8u cv::rvv_hal::core::normHammingDiff8u
/* ############ convertScale ############ */
int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step,
int width, int height, int sdepth, int ddepth, double alpha, double beta);
#undef cv_hal_convertScale
#define cv_hal_convertScale cv::rvv_hal::core::convertScale
/* ############ minMaxIdx ############ */
int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth,
double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0);
#undef cv_hal_minMaxIdx
#define cv_hal_minMaxIdx cv::rvv_hal::core::minMaxIdx
#undef cv_hal_minMaxIdxMaskStep
#define cv_hal_minMaxIdxMaskStep cv::rvv_hal::core::minMaxIdx
/* ############ fastAtan ############ */
int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg);
int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg);
#undef cv_hal_fastAtan32f
#define cv_hal_fastAtan32f cv::rvv_hal::core::fast_atan_32
#undef cv_hal_fastAtan64f
#define cv_hal_fastAtan64f cv::rvv_hal::core::fast_atan_64
/* ############ split ############ */
int split8u(const uchar* src, uchar** dst, int len, int cn);
#undef cv_hal_split8u
#define cv_hal_split8u cv::rvv_hal::core::split8u
/* ############ sqrt ############ */
int sqrt32f(const float* src, float* dst, int _len);
int sqrt64f(const double* src, double* dst, int _len);
#undef cv_hal_sqrt32f
#define cv_hal_sqrt32f cv::rvv_hal::core::sqrt32f
#undef cv_hal_sqrt64f
#define cv_hal_sqrt64f cv::rvv_hal::core::sqrt64f
int invSqrt32f(const float* src, float* dst, int _len);
int invSqrt64f(const double* src, double* dst, int _len);
#undef cv_hal_invSqrt32f
#define cv_hal_invSqrt32f cv::rvv_hal::core::invSqrt32f
#undef cv_hal_invSqrt64f
#define cv_hal_invSqrt64f cv::rvv_hal::core::invSqrt64f
/* ############ magnitude ############ */
int magnitude32f(const float *x, const float *y, float *dst, int len);
int magnitude64f(const double *x, const double *y, double *dst, int len);
#undef cv_hal_magnitude32f
#define cv_hal_magnitude32f cv::rvv_hal::core::magnitude32f
#undef cv_hal_magnitude64f
#define cv_hal_magnitude64f cv::rvv_hal::core::magnitude64f
/* ############ cartToPolar ############ */
int cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int len, bool angleInDegrees);
int cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int len, bool angleInDegrees);
#undef cv_hal_cartToPolar32f
#define cv_hal_cartToPolar32f cv::rvv_hal::core::cartToPolar32f
#undef cv_hal_cartToPolar64f
#define cv_hal_cartToPolar64f cv::rvv_hal::core::cartToPolar64f
/* ############ polarToCart ############ */
int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees);
int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees);
#undef cv_hal_polarToCart32f
#define cv_hal_polarToCart32f cv::rvv_hal::core::polarToCart32f
#undef cv_hal_polarToCart64f
#define cv_hal_polarToCart64f cv::rvv_hal::core::polarToCart64f
/* ############ flip ############ */
int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int flip_mode);
#undef cv_hal_flip
#define cv_hal_flip cv::rvv_hal::core::flip
/* ############ lut ############ */
int lut(const uchar* src_data, size_t src_step, size_t src_type,
const uchar* lut_data, size_t lut_channel_size, size_t lut_channels,
uchar* dst_data, size_t dst_step, int width, int height);
#undef cv_hal_lut
#define cv_hal_lut cv::rvv_hal::core::lut
/* ############ exp ############ */
int exp32f(const float* src, float* dst, int _len);
int exp64f(const double* src, double* dst, int _len);
#undef cv_hal_exp32f
#define cv_hal_exp32f cv::rvv_hal::core::exp32f
#undef cv_hal_exp64f
#define cv_hal_exp64f cv::rvv_hal::core::exp64f
/* ############ log ############ */
int log32f(const float* src, float* dst, int _len);
int log64f(const double* src, double* dst, int _len);
#undef cv_hal_log32f
#define cv_hal_log32f cv::rvv_hal::core::log32f
#undef cv_hal_log64f
#define cv_hal_log64f cv::rvv_hal::core::log64f
/* ############ lu ############ */
int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info);
int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info);
#undef cv_hal_LU32f
#define cv_hal_LU32f cv::rvv_hal::core::LU32f
#undef cv_hal_LU64f
#define cv_hal_LU64f cv::rvv_hal::core::LU64f
/* ############ cholesky ############ */
int Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info);
int Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info);
#undef cv_hal_Cholesky32f
#define cv_hal_Cholesky32f cv::rvv_hal::core::Cholesky32f
#undef cv_hal_Cholesky64f
#define cv_hal_Cholesky64f cv::rvv_hal::core::Cholesky64f
/* ############ qr ############ */
int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info);
int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info);
#undef cv_hal_QR32f
#define cv_hal_QR32f cv::rvv_hal::core::QR32f
#undef cv_hal_QR64f
#define cv_hal_QR64f cv::rvv_hal::core::QR64f
/* ############ SVD ############ */
int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags);
int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags);
#undef cv_hal_SVD32f
#define cv_hal_SVD32f cv::rvv_hal::core::SVD32f
#undef cv_hal_SVD64f
#define cv_hal_SVD64f cv::rvv_hal::core::SVD64f
/* ############ copyToMasked ############ */
int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height,
int type, const uchar *mask_data, size_t mask_step, int mask_type);
#undef cv_hal_copyToMasked
#define cv_hal_copyToMasked cv::rvv_hal::core::copyToMasked
/* ############ div, recip ############ */
int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale);
int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale);
int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale);
int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale);
int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale);
int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale);
// int div64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, double *dst_data, size_t dst_step, int width, int height, double scale);
#undef cv_hal_div8u
#define cv_hal_div8u cv::rvv_hal::core::div8u
#undef cv_hal_div8s
#define cv_hal_div8s cv::rvv_hal::core::div8s
#undef cv_hal_div16u
#define cv_hal_div16u cv::rvv_hal::core::div16u
#undef cv_hal_div16s
#define cv_hal_div16s cv::rvv_hal::core::div16s
#undef cv_hal_div32s
#define cv_hal_div32s cv::rvv_hal::core::div32s
#undef cv_hal_div32f
#define cv_hal_div32f cv::rvv_hal::core::div32f
// #undef cv_hal_div64f
// #define cv_hal_div64f cv::rvv_hal::core::div64f
int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale);
int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale);
int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale);
int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale);
int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale);
int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale);
// int recip64f(const double *src_data, size_t src_step, double *dst_data, size_t dst_step, int width, int height, double scale);
#undef cv_hal_recip8u
#define cv_hal_recip8u cv::rvv_hal::core::recip8u
#undef cv_hal_recip8s
#define cv_hal_recip8s cv::rvv_hal::core::recip8s
#undef cv_hal_recip16u
#define cv_hal_recip16u cv::rvv_hal::core::recip16u
#undef cv_hal_recip16s
#define cv_hal_recip16s cv::rvv_hal::core::recip16s
#undef cv_hal_recip32s
#define cv_hal_recip32s cv::rvv_hal::core::recip32s
#undef cv_hal_recip32f
#define cv_hal_recip32f cv::rvv_hal::core::recip32f
// #undef cv_hal_recip64f
// #define cv_hal_recip64f cv::rvv_hal::core::recip64f
/* ############ dotProduct ############ */
int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step,
int width, int height, int type, double *dot_val);
#undef cv_hal_dotProduct
#define cv_hal_dotProduct cv::rvv_hal::core::dotprod
/* ############ compare ############ */
int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
// int cmp64f(const double *src1_data, size_t src1_step, const double *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation);
#undef cv_hal_cmp8u
#define cv_hal_cmp8u cv::rvv_hal::core::cmp8u
#undef cv_hal_cmp8s
#define cv_hal_cmp8s cv::rvv_hal::core::cmp8s
#undef cv_hal_cmp16u
#define cv_hal_cmp16u cv::rvv_hal::core::cmp16u
#undef cv_hal_cmp16s
#define cv_hal_cmp16s cv::rvv_hal::core::cmp16s
#undef cv_hal_cmp32s
#define cv_hal_cmp32s cv::rvv_hal::core::cmp32s
#undef cv_hal_cmp32f
#define cv_hal_cmp32f cv::rvv_hal::core::cmp32f
// #undef cv_hal_cmp64f
// #define cv_hal_cmp64f cv::rvv_hal::core::cmp64f
/* ############ transpose2d ############ */
int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
int src_width, int src_height, int element_size);
#undef cv_hal_transpose2d
#define cv_hal_transpose2d cv::rvv_hal::core::transpose2d
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core
#endif // OPENCV_RVV_HAL_CORE_HPP

View File

@ -0,0 +1,256 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_RVV_HAL_IMGPROC_HPP
#define OPENCV_RVV_HAL_IMGPROC_HPP
struct cvhalFilter2D;
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
/* ############ imageMoments ############ */
int imageMoments(const uchar* src_data, size_t src_step, int src_type,
int width, int height, bool binary, double m[10]);
#undef cv_hal_imageMoments
#define cv_hal_imageMoments cv::rvv_hal::imgproc::imageMoments
/* ############ filter ############ */
int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/);
int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y);
int filterFree(cvhalFilter2D* context);
#undef cv_hal_filterInit
#define cv_hal_filterInit cv::rvv_hal::imgproc::filterInit
#undef cv_hal_filter
#define cv_hal_filter cv::rvv_hal::imgproc::filter
#undef cv_hal_filterFree
#define cv_hal_filterFree cv::rvv_hal::imgproc::filterFree
/* ############ sepFilter ############ */
int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType);
int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y);
int sepFilterFree(cvhalFilter2D* context);
#undef cv_hal_sepFilterInit
#define cv_hal_sepFilterInit cv::rvv_hal::imgproc::sepFilterInit
#undef cv_hal_sepFilter
#define cv_hal_sepFilter cv::rvv_hal::imgproc::sepFilter
#undef cv_hal_sepFilterFree
#define cv_hal_sepFilterFree cv::rvv_hal::imgproc::sepFilterFree
/* ############ morph ############ */
int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/);
int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/);
int morphFree(cvhalFilter2D* context);
#undef cv_hal_morphInit
#undef cv_hal_morph
#undef cv_hal_morphFree
#define cv_hal_morphInit cv::rvv_hal::imgproc::morphInit
#define cv_hal_morph cv::rvv_hal::imgproc::morph
#define cv_hal_morphFree cv::rvv_hal::imgproc::morphFree
/* ############ gaussianBlur ############ */
int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type);
#undef cv_hal_gaussianBlurBinomial
#define cv_hal_gaussianBlurBinomial cv::rvv_hal::imgproc::gaussianBlurBinomial
/* ############ medianBlur ############ */
int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize);
#undef cv_hal_medianBlur
#define cv_hal_medianBlur cv::rvv_hal::imgproc::medianBlur
/* ############ boxFilter ############ */
int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type);
#undef cv_hal_boxFilter
#define cv_hal_boxFilter cv::rvv_hal::imgproc::boxFilter
/* ############ bilateralFilter ############ */
int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
int width, int height, int depth, int cn, int d, double sigma_color,
double sigma_space, int border_type);
#undef cv_hal_bilateralFilter
#define cv_hal_bilateralFilter cv::rvv_hal::imgproc::bilateralFilter
/* ############ pyramid ############ */
int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type);
int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type);
#undef cv_hal_pyrdown
#define cv_hal_pyrdown cv::rvv_hal::imgproc::pyrDown
#undef cv_hal_pyrup
#define cv_hal_pyrup cv::rvv_hal::imgproc::pyrUp
/* ############ cvtColor ############ */
int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue);
int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn);
int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue);
int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits);
int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits);
int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits);
int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits);
int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr);
int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr);
int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx);
int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx);
int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx);
int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx);
int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step, int width, int height, int scn, bool swapBlue, int uIdx);
int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx);
int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV);
int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV);
int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue);
int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue);
int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb);
int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb);
#undef cv_hal_cvtBGRtoBGR
#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR
#undef cv_hal_cvtGraytoBGR
#define cv_hal_cvtGraytoBGR cv::rvv_hal::imgproc::cvtGraytoBGR
#undef cv_hal_cvtBGRtoGray
#define cv_hal_cvtBGRtoGray cv::rvv_hal::imgproc::cvtBGRtoGray
#undef cv_hal_cvtBGR5x5toBGR
#define cv_hal_cvtBGR5x5toBGR cv::rvv_hal::imgproc::cvtBGR5x5toBGR
#undef cv_hal_cvtBGRtoBGR5x5
#define cv_hal_cvtBGRtoBGR5x5 cv::rvv_hal::imgproc::cvtBGRtoBGR5x5
#undef cv_hal_cvtBGR5x5toGray
#define cv_hal_cvtBGR5x5toGray cv::rvv_hal::imgproc::cvtBGR5x5toGray
#undef cv_hal_cvtGraytoBGR5x5
#define cv_hal_cvtGraytoBGR5x5 cv::rvv_hal::imgproc::cvtGraytoBGR5x5
#undef cv_hal_cvtYUVtoBGR
#define cv_hal_cvtYUVtoBGR cv::rvv_hal::imgproc::cvtYUVtoBGR
#undef cv_hal_cvtBGRtoYUV
#define cv_hal_cvtBGRtoYUV cv::rvv_hal::imgproc::cvtBGRtoYUV
#undef cv_hal_cvtOnePlaneYUVtoBGR
#define cv_hal_cvtOnePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtOnePlaneYUVtoBGR
#undef cv_hal_cvtTwoPlaneYUVtoBGR
#define cv_hal_cvtTwoPlaneYUVtoBGR cv::rvv_hal::imgproc::cvtTwoPlaneYUVtoBGR
#undef cv_hal_cvtThreePlaneYUVtoBGR
#define cv_hal_cvtThreePlaneYUVtoBGR cv::rvv_hal::imgproc::cvtThreePlaneYUVtoBGR
#undef cv_hal_cvtOnePlaneBGRtoYUV
#define cv_hal_cvtOnePlaneBGRtoYUV cv::rvv_hal::imgproc::cvtOnePlaneBGRtoYUV
#undef cv_hal_cvtBGRtoTwoPlaneYUV
#define cv_hal_cvtBGRtoTwoPlaneYUV cv::rvv_hal::imgproc::cvtBGRtoTwoPlaneYUV
#undef cv_hal_cvtBGRtoThreePlaneYUV
#define cv_hal_cvtBGRtoThreePlaneYUV cv::rvv_hal::imgproc::cvtBGRtoThreePlaneYUV
#undef cv_hal_cvtHSVtoBGR
#define cv_hal_cvtHSVtoBGR cv::rvv_hal::imgproc::cvtHSVtoBGR
#undef cv_hal_cvtBGRtoHSV
#define cv_hal_cvtBGRtoHSV cv::rvv_hal::imgproc::cvtBGRtoHSV
#undef cv_hal_cvtXYZtoBGR
#define cv_hal_cvtXYZtoBGR cv::rvv_hal::imgproc::cvtXYZtoBGR
#undef cv_hal_cvtBGRtoXYZ
#define cv_hal_cvtBGRtoXYZ cv::rvv_hal::imgproc::cvtBGRtoXYZ
#undef cv_hal_cvtLabtoBGR
#define cv_hal_cvtLabtoBGR cv::rvv_hal::imgproc::cvtLabtoBGR
#undef cv_hal_cvtBGRtoLab
#define cv_hal_cvtBGRtoLab cv::rvv_hal::imgproc::cvtBGRtoLab
/* ############ warp ############ */
int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
int interpolation, int border_type, const double border_value[4]);
int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]);
int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step,
int interpolation, int border_type, const double border_value[4]);
// BUG: https://github.com/opencv/opencv/issues/27279
// #undef cv_hal_remap32f
// #define cv_hal_remap32f cv::cv_hal_rvv::imgproc::remap32f
// #undef cv_hal_remap32fc2
// #define cv_hal_remap32fc2 cv::cv_hal_rvv::imgproc::remap32fc2
// #undef cv_hal_remap16s
// #define cv_hal_remap16s cv::cv_hal_rvv::imgproc::remap16s
int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]);
int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]);
// BUG: https://github.com/opencv/opencv/issues/27280
//#undef cv_hal_warpAffine
//#define cv_hal_warpAffine cv::cv_hal_rvv::imgproc::warpAffine
//#undef cv_hal_warpPerspective
//#define cv_hal_warpPerspective cv::cv_hal_rvv::imgproc::warpPerspective
/* ############ threshold ############ */
int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType);
int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh);
int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C);
// disabled since UI is fast enough, only called in threshold_otsu
// #undef cv_hal_threshold
// #define cv_hal_threshold cv::rvv_hal::imgproc::threshold
#undef cv_hal_threshold_otsu
#define cv_hal_threshold_otsu cv::rvv_hal::imgproc::threshold_otsu
#undef cv_hal_adaptiveThreshold
#define cv_hal_adaptiveThreshold cv::rvv_hal::imgproc::adaptiveThreshold
/* ############ histogram ############ */
int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height);
#undef cv_hal_equalize_hist
#define cv_hal_equalize_hist cv::rvv_hal::imgproc::equalize_hist
int calc_hist(const uchar* src_data, size_t src_step, int src_type, int src_width, int src_height, float* hist_data, int hist_size, const float** ranges, bool uniform, bool accumulate);
#undef cv_hal_calcHist
#define cv_hal_calcHist cv::rvv_hal::imgproc::calc_hist
/* ############ resize ############ */
int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation);
#undef cv_hal_resize
#define cv_hal_resize cv::rvv_hal::imgproc::resize
/* ############ resize ############ */
int integral(int depth, int sdepth, int sqdepth,
const uchar* src_data, size_t src_step,
uchar* sum_data, size_t sum_step,
uchar* sqsum_data, size_t sqsum_step,
uchar* tilted_data, [[maybe_unused]] size_t tilted_step,
int width, int height, int cn);
#undef cv_hal_integral
#define cv_hal_integral cv::rvv_hal::imgproc::integral
#endif // CV_HAL_RVV_1P0_ENABLED
#if CV_HAL_RVV_071_ENABLED
int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue);
#undef cv_hal_cvtBGRtoBGR
#define cv_hal_cvtBGRtoBGR cv::rvv_hal::imgproc::cvtBGRtoBGR
#endif // CV_HAL_RVV_071_ENABLED
}}} // cv::rvv_hal::imgproc
#endif // OPENCV_RVV_HAL_IMGPROC_HPP

View File

@ -4,13 +4,15 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_TYPES_HPP_INCLUDED
#define OPENCV_HAL_RVV_TYPES_HPP_INCLUDED
#ifndef OPENCV_RVV_HAL_TYPES_HPP
#define OPENCV_RVV_HAL_TYPES_HPP
#include <riscv_vector.h>
#include <type_traits>
namespace cv { namespace cv_hal_rvv {
namespace cv { namespace rvv_hal {
#if CV_HAL_RVV_1P0_ENABLED
enum RVV_LMUL
{
@ -869,6 +871,8 @@ HAL_RVV_GROUP(RVV_F64M1, RVV_F64M8, f64, m1, m8)
#undef HAL_RVV_GROUP
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_TYPES_HPP_INCLUDED
}} // namespace cv::rvv_hal
#endif //OPENCV_RVV_HAL_TYPES_HPP

31
hal/riscv-rvv/rvv_hal.hpp Normal file
View File

@ -0,0 +1,31 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_HPP_INCLUDED
#define OPENCV_HAL_RVV_HPP_INCLUDED
#include "opencv2/core/base.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/hal/interface.h"
#if defined(__riscv_v) && __riscv_v == 1000000
#define CV_HAL_RVV_1P0_ENABLED 1
#else
#define CV_HAL_RVV_1P0_ENABLED 0
#endif
#if defined(__riscv_v) && __riscv_v == 7000 && defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ == 4 && defined(__THEAD_VERSION__)
#define CV_HAL_RVV_071_ENABLED 1
#else
#define CV_HAL_RVV_071_ENABLED 0
#endif
#if CV_HAL_RVV_1P0_ENABLED || CV_HAL_RVV_071_ENABLED
#include <riscv_vector.h>
#endif
#include "include/types.hpp"
#include "include/core.hpp"
#include "include/imgproc.hpp"
#endif // OPENCV_HAL_RVV_HPP_INCLUDED

View File

@ -0,0 +1,64 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
// Vectorized elementwise atan2(y[i], x[i]) for n float32 pairs.
// Results are written to dst; units are degrees when angle_in_deg is true,
// radians otherwise (selected by pre-scaled polynomial coefficients).
// Processes the arrays in strips of vl lanes using e32/LMUL=4 registers.
int fast_atan_32(const float* y, const float* x, float* dst, size_t n, bool angle_in_deg)
{
// Coefficient set already carries the rad/deg output scaling.
auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad;
for (size_t vl = 0; n > 0; n -= vl)
{
vl = __riscv_vsetvl_e32m4(n); // number of lanes processed this iteration
auto vy = __riscv_vle32_v_f32m4(y, vl);
auto vx = __riscv_vle32_v_f32m4(x, vl);
auto a = common::rvv_atan(vy, vx, vl, atan_params);
__riscv_vse32(dst, a, vl);
// Advance all three pointers to the next strip.
x += vl;
y += vl;
dst += vl;
}
return CV_HAL_ERROR_OK;
}
// Vectorized elementwise atan2(y[i], x[i]) for n float64 pairs.
// The doubles are narrowed to float32, the float32 atan kernel is applied,
// and the result is widened back to float64 before storing — so precision
// is that of the float32 path.
int fast_atan_64(const double* y, const double* x, double* dst, size_t n, bool angle_in_deg)
{
// this also uses float32 version, ref: mathfuncs_core.simd.hpp
auto atan_params = angle_in_deg ? common::atan_params_deg : common::atan_params_rad;
for (size_t vl = 0; n > 0; n -= vl)
{
vl = __riscv_vsetvl_e64m8(n); // lanes for this strip (e64/LMUL=8)
auto vy = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(y, vl), vl); // f64 -> f32 narrow
auto vx = __riscv_vfncvt_f(__riscv_vle64_v_f64m8(x, vl), vl);
auto a = common::rvv_atan(vy, vx, vl, atan_params);
__riscv_vse64(dst, __riscv_vfwcvt_f(a, vl), vl); // f32 -> f64 widen on store
x += vl;
y += vl;
dst += vl;
}
return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,27 +4,20 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
#define OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "common.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
#include "hal_rvv_1p0/atan.hpp"
#include "hal_rvv_1p0/sqrt.hpp"
#include "hal_rvv_1p0/types.hpp"
#if CV_HAL_RVV_1P0_ENABLED
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_cartToPolar32f
#define cv_hal_cartToPolar32f cv::cv_hal_rvv::cartToPolar<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_cartToPolar64f
#define cv_hal_cartToPolar64f cv::cv_hal_rvv::cartToPolar<cv::cv_hal_rvv::RVV_F64M8>
namespace {
template <typename RVV_T, typename T = typename RVV_T::ElemType>
inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool angleInDegrees)
{
using CalType = RVV_SameLen<float, RVV_T>;
auto atan_params = angleInDegrees ? detail::atan_params_deg : detail::atan_params_rad;
auto atan_params = angleInDegrees ? common::atan_params_deg : common::atan_params_rad;
size_t vl;
for (; len > 0; len -= (int)vl, x += vl, y += vl, mag += vl, angle += vl)
{
@ -33,16 +26,25 @@ inline int cartToPolar(const T* x, const T* y, T* mag, T* angle, int len, bool a
auto vx = CalType::cast(RVV_T::vload(x, vl), vl);
auto vy = CalType::cast(RVV_T::vload(y, vl), vl);
auto vmag = detail::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
auto vmag = common::sqrt<2>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
RVV_T::vstore(mag, RVV_T::cast(vmag, vl), vl);
auto vangle = detail::rvv_atan(vy, vx, vl, atan_params);
auto vangle = common::rvv_atan(vy, vx, vl, atan_params);
RVV_T::vstore(angle, RVV_T::cast(vangle, vl), vl);
}
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
} // anonymous
#endif // OPENCV_HAL_RVV_CART_TO_POLAR_HPP_INCLUDED
// HAL entry point: single-precision cartesian-to-polar conversion.
// Forwards to the shared templated kernel, instantiated for f32 with LMUL=4.
int cartToPolar32f(const float* x, const float* y, float* mag, float* angle,
                   int len, bool angleInDegrees)
{
    return cartToPolar<RVV_F32M4>(x, y, mag, angle, len, angleInDegrees);
}

// HAL entry point: double-precision cartesian-to-polar conversion.
// Forwards to the shared templated kernel, instantiated for f64 with LMUL=8.
int cartToPolar64f(const double* x, const double* y, double* mag, double* angle,
                   int len, bool angleInDegrees)
{
    return cartToPolar<RVV_F64M8>(x, y, mag, angle, len, angleInDegrees);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,20 +4,15 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
#define OPENCV_HAL_RVV_CHOLESKY_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <cmath>
#include <limits>
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv { namespace cholesky {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_Cholesky32f
#define cv_hal_Cholesky32f cv::cv_hal_rvv::cholesky::Cholesky<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_Cholesky64f
#define cv_hal_Cholesky64f cv::cv_hal_rvv::cholesky::Cholesky<cv::cv_hal_rvv::RVV_F64M4>
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::CholImpl
@ -119,6 +114,15 @@ inline int Cholesky(T* src1, size_t src1_step, int m, T* src2, size_t src2_step,
return CV_HAL_ERROR_OK;
}
}}}
} // anonymous
#endif
// HAL entry point: Cholesky decomposition / solve for float32 matrices.
// Forwards to the shared templated implementation (f32, LMUL=4).
int Cholesky32f(float* src1, size_t src1_step, int m,
                float* src2, size_t src2_step, int n, bool* info)
{
    return Cholesky<RVV_F32M4>(src1, src1_step, m, src2, src2_step, n, info);
}

// HAL entry point: Cholesky decomposition / solve for float64 matrices.
// Forwards to the shared templated implementation (f64, LMUL=4).
int Cholesky64f(double* src1, size_t src1_step, int m,
                double* src2, size_t src2_step, int n, bool* info)
{
    return Cholesky<RVV_F64M4>(src1, src1_step, m, src2, src2_step, n, info);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -0,0 +1,195 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED
#include <riscv_vector.h>
#include <cmath>
#include <cfloat>
namespace cv { namespace rvv_hal { namespace core { namespace common {
#if CV_HAL_RVV_1P0_ENABLED
// Identity "cast" used where an ABSDIFF instantiation needs no reinterpret.
#define CV_HAL_RVV_NOOP(a) (a)

// ############ abs ############
// Branchless absolute value for signed integer vectors:
// mask = v >> (bits-1) (all-ones for negative lanes, zero otherwise),
// then (v ^ mask) - mask == |v|; result is reinterpreted as unsigned.
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(_Tpvs, _Tpvd, shift, suffix) \
inline _Tpvd __riscv_vabs(const _Tpvs& v, const int vl) { \
_Tpvs mask = __riscv_vsra(v, shift, vl); \
_Tpvs v_xor = __riscv_vxor(v, mask, vl); \
return __riscv_vreinterpret_##suffix( \
__riscv_vsub(v_xor, mask, vl) \
); \
}

// Instantiations for the element-width / LMUL combinations used by the HAL.
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m2_t, vuint8m2_t, 7, u8m2)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint8m8_t, vuint8m8_t, 7, u8m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m4_t, vuint16m4_t, 15, u16m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint16m8_t, vuint16m8_t, 15, u16m8)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m4_t, vuint32m4_t, 31, u32m4)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABS(vint32m8_t, vuint32m8_t, 31, u32m8)

// ############ absdiff ############
// Absolute difference without overflow: |v1 - v2| == max(v1,v2) - min(v1,v2).
// For signed inputs the (non-negative) result is reinterpreted as unsigned
// via the `cast` argument; unsigned inputs pass CV_HAL_RVV_NOOP.
#define CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(_Tpvs, _Tpvd, cast, sub, max, min) \
inline _Tpvd __riscv_vabd(const _Tpvs& v1, const _Tpvs& v2, const int vl) { \
return cast(__riscv_##sub(__riscv_##max(v1, v2, vl), __riscv_##min(v1, v2, vl), vl)); \
}

// Unsigned instantiations (vmaxu/vminu, no reinterpret needed).
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m4_t, vuint8m4_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint8m8_t, vuint8m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m2_t, vuint16m2_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vuint16m8_t, vuint16m8_t, CV_HAL_RVV_NOOP, vsub, vmaxu, vminu)
// Signed instantiations (vmax/vmin, result reinterpreted to unsigned).
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m4_t, vuint8m4_t, __riscv_vreinterpret_u8m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint8m8_t, vuint8m8_t, __riscv_vreinterpret_u8m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m2_t, vuint16m2_t, __riscv_vreinterpret_u16m2, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint16m8_t, vuint16m8_t, __riscv_vreinterpret_u16m8, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m4_t, vuint32m4_t, __riscv_vreinterpret_u32m4, vsub, vmax, vmin)
CV_HAL_RVV_COMMON_CUSTOM_INTRIN_ABSDIFF(vint32m8_t, vuint32m8_t, __riscv_vreinterpret_u32m8, vsub, vmax, vmin)
// ############ reciprocal ############
// Refined reciprocal 1/x: starts from the ~7-bit vfrec7 estimate and applies
// two Newton-Raphson steps, rec = rec * (2 - x*rec), only on lanes whose
// estimate is a finite non-zero value (mask excludes the same fpclass bits
// blocked in sqrt below: -0, +0, positive subnormal, +inf).
inline vfloat32m4_t __riscv_vfrec(const vfloat32m4_t &x, const int vl) {
auto rec = __riscv_vfrec7(x, vl); // low-precision hardware estimate
auto cls = __riscv_vfclass(rec, vl);
auto m = __riscv_vmseq(__riscv_vand(cls, 0b10111000, vl), 0, vl); // lanes safe to refine
auto two = __riscv_vfmv_v_f_f32m4(2.f, vl);
// vfnmsac computes (2 - x*rec); masked multiply leaves unrefined lanes as the raw estimate.
rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
rec = __riscv_vfmul_mu(m, rec, __riscv_vfnmsac(two, x, rec, vl), rec, vl);
return rec;
}
// ############ atan ############
// ref: mathfuncs_core.simd.hpp
static constexpr float pi = CV_PI;

// Odd-polynomial coefficients (p1..p7) for the atan approximation on the
// reduced argument c in [0,1], plus the quarter-turn constant used for
// octant fix-ups. Two pre-scaled sets select radians vs. degrees output.
struct AtanParams
{
float p1, p3, p5, p7, angle_90;
};

static constexpr AtanParams atan_params_rad {
0.9997878412794807F,
-0.3258083974640975F,
0.1555786518463281F,
-0.04432655554792128F,
90.F * (pi / 180.F)};
// Degree variant: same polynomial with every coefficient scaled by 180/pi.
static constexpr AtanParams atan_params_deg {
atan_params_rad.p1 * (180 / pi),
atan_params_rad.p3 * (180 / pi),
atan_params_rad.p5 * (180 / pi),
atan_params_rad.p7 * (180 / pi),
90.F};

// Elementwise atan2(vy, vx). The angle is first approximated in the first
// octant from c = min(|x|,|y|) / (max(|x|,|y|) + eps), then reflected into
// the correct quadrant with three masked `90 - a` / `180 - a` / `360 - a`
// style fix-ups, giving a full-circle result (range [0, 4*angle_90)).
template <typename VEC_T>
__attribute__((always_inline)) inline VEC_T
rvv_atan(VEC_T vy, VEC_T vx, size_t vl, const AtanParams& params)
{
const auto ax = __riscv_vfabs(vx, vl);
const auto ay = __riscv_vfabs(vy, vl);
// Reciprocal Estimate (vfrec7) is not accurate enough to pass the test of cartToPolar.
const auto c = __riscv_vfdiv(__riscv_vfmin(ax, ay, vl),
__riscv_vfadd(__riscv_vfmax(ax, ay, vl), FLT_EPSILON, vl),
vl);
const auto c2 = __riscv_vfmul(c, c, vl);

// Using vfmadd only results in about a 2% performance improvement, but it occupies 3 additional
// M4 registers. (Performance test on phase32f::VectorLength::1048576: time decreased
// from 5.952ms to 5.805ms on Muse Pi)
// Additionally, when registers are nearly fully utilized (though not yet exhausted), the
// compiler is likely to fail to optimize and may introduce slower memory access (e.g., in
// cv::rvv_hal::fast_atan_64).
// Saving registers can also make this function more reusable in other contexts.
// Therefore, vfmadd is not used here.
// Horner evaluation of a*c = c*(p1 + c2*(p3 + c2*(p5 + c2*p7))).
auto a = __riscv_vfadd(__riscv_vfmul(c2, params.p7, vl), params.p5, vl);
a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p3, vl);
a = __riscv_vfadd(__riscv_vfmul(c2, a, vl), params.p1, vl);
a = __riscv_vfmul(a, c, vl);

// Octant/quadrant corrections (masked reverse-subtract):
a = __riscv_vfrsub_mu(__riscv_vmflt(ax, ay, vl), a, a, params.angle_90, vl);      // |y| > |x|: a = 90 - a
a = __riscv_vfrsub_mu(__riscv_vmflt(vx, 0.F, vl), a, a, params.angle_90 * 2, vl); // x < 0:    a = 180 - a
a = __riscv_vfrsub_mu(__riscv_vmflt(vy, 0.F, vl), a, a, params.angle_90 * 4, vl); // y < 0:    a = 360 - a
return a;
}
// ############ sqrt ############
// Traits selecting the Newton-Raphson iteration count per precision:
// 2 refinements suffice for float32, float64 needs 3.
template <typename RVV_T>
struct Sqrt32f
{
using T = RVV_T;
static constexpr size_t iter_times = 2;
};
template <typename RVV_T>
struct Sqrt64f
{
using T = RVV_T;
static constexpr size_t iter_times = 3;
};

// Newton-Raphson method
// Use 4 LMUL registers
// Computes sqrt(x) as x * rsqrt(x): refines the vfrsqrt7 estimate y with
// y = y * (1.5 - 0.5*x*y*y), then multiplies by x on lanes where the input
// is a refinable value (see mask below).
template <size_t iter_times, typename VEC_T>
inline VEC_T sqrt(VEC_T x, size_t vl)
{
auto x2 = __riscv_vfmul(x, 0.5, vl);
auto y = __riscv_vfrsqrt7(x, vl); // low-precision 1/sqrt(x) estimate
#ifdef __clang__
#pragma unroll
#endif
for (size_t i = 0; i < iter_times; i++)
{
auto t = __riscv_vfmul(y, y, vl);
t = __riscv_vfmul(t, x2, vl);
t = __riscv_vfrsub(t, 1.5, vl); // t = 1.5 - (x/2)*y*y
y = __riscv_vfmul(t, y, vl);
}
// just to prevent the compiler from calculating mask before the iteration, which will run out
// of registers and cause memory access.
asm volatile("" ::: "memory");
auto classified = __riscv_vfclass(x, vl);
// block -0, +0, positive subnormal number, +inf
auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
// Masked final multiply: blocked lanes keep x itself (e.g. sqrt(+0)=+0, sqrt(+inf)=+inf).
return __riscv_vfmul_mu(mask, x, x, y, vl);
}
// Newton-Raphson method
// Use 3 LMUL registers and 1 mask register
// Computes 1/sqrt(x) directly: refines the vfrsqrt7 estimate with
// y = y * (1.5 - 0.5*x*y*y). Unlike sqrt() above, the class mask is applied
// inside the loop, so blocked lanes (-0, +0, positive subnormal, +inf)
// keep the raw hardware estimate.
template <size_t iter_times, typename VEC_T>
inline VEC_T invSqrt(VEC_T x, size_t vl)
{
auto classified = __riscv_vfclass(x, vl);
// block -0, +0, positive subnormal number, +inf
auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
auto x2 = __riscv_vfmul(x, 0.5, vl);
auto y = __riscv_vfrsqrt7(x, vl); // low-precision estimate
#ifdef __clang__
#pragma unroll
#endif
for (size_t i = 0; i < iter_times; i++)
{
auto t = __riscv_vfmul(y, y, vl);
t = __riscv_vfmul(t, x2, vl);
t = __riscv_vfrsub(t, 1.5, vl); // t = 1.5 - (x/2)*y*y
y = __riscv_vfmul_mu(mask, y, t, y, vl); // refine only well-classed lanes
}
return y;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}}} // cv::rvv_hal::core::common
#endif // OPENCV_HAL_RVV_CORE_COMMON_HPP_INCLUDED

View File

@ -5,12 +5,11 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED
#define OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "types.hpp"
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv { namespace compare {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
@ -90,23 +89,6 @@ int compare_impl(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data,
return CV_HAL_ERROR_OK;
}
} // anonymous
#undef cv_hal_cmp8u
#define cv_hal_cmp8u cv::cv_hal_rvv::compare::compare<uchar>
#undef cv_hal_cmp8s
#define cv_hal_cmp8s cv::cv_hal_rvv::compare::compare<schar>
#undef cv_hal_cmp16u
#define cv_hal_cmp16u cv::cv_hal_rvv::compare::compare<ushort>
#undef cv_hal_cmp16s
#define cv_hal_cmp16s cv::cv_hal_rvv::compare::compare<short>
#undef cv_hal_cmp32s
#define cv_hal_cmp32s cv::cv_hal_rvv::compare::compare<int>
#undef cv_hal_cmp32f
#define cv_hal_cmp32f cv::cv_hal_rvv::compare::compare<float>
// #undef cv_hal_cmp64f
// #define cv_hal_cmp64f cv::cv_hal_rvv::compare::compare<double>
template <typename _Tps> inline
int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size_t src2_step,
uchar *dst_data, size_t dst_step, int width, int height, int operation) {
@ -121,6 +103,27 @@ int compare(const _Tps *src1_data, size_t src1_step, const _Tps *src2_data, size
}
}
}}} // cv::cv_hal_rvv::compare
} // namespace anonymous
#endif // OPENCV_HAL_RVV_COMPARE_HPP_INCLUDED
int cmp8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<uchar>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
int cmp8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<schar>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
int cmp16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<ushort>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
int cmp16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<short>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
int cmp32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<int>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
int cmp32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, int operation) {
return compare<float>(src1_data, src1_step, src2_data, src2_step, dst_data, dst_step, width, height, operation);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,15 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED
#define OPENCV_HAL_RVV_CONVERT_SCALE_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_convertScale
#define cv_hal_convertScale cv::cv_hal_rvv::convertScale
#if CV_HAL_RVV_1P0_ENABLED
inline int convertScale_8U8U(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height, double alpha, double beta)
{
@ -89,8 +85,8 @@ inline int convertScale_32F32F(const uchar* src, size_t src_step, uchar* dst, si
return CV_HAL_ERROR_OK;
}
inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step, int width, int height,
int sdepth, int ddepth, double alpha, double beta)
int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t dst_step,
int width, int height, int sdepth, int ddepth, double alpha, double beta)
{
if (!dst)
return CV_HAL_ERROR_OK;
@ -118,6 +114,6 @@ inline int convertScale(const uchar* src, size_t src_step, uchar* dst, size_t ds
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -5,15 +5,11 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED
#define OPENCV_HAL_RVV_COPY_MASK_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_copyToMasked
#define cv_hal_copyToMasked cv::cv_hal_rvv::copyToMasked
#if CV_HAL_RVV_1P0_ENABLED
namespace {
@ -100,71 +96,35 @@ static int copyToMasked_e64c4(const uchar *src_data, size_t src_step,
} // anonymous
using CopyToMaskedFunc = int (*)(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int);
inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height,
int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height,
int type, const uchar *mask_data, size_t mask_step, int mask_type) {
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int cn = CV_MAT_CN(type);
int mdepth = CV_MAT_DEPTH(mask_type), mcn = CV_MAT_CN(mask_type);
if (mcn > 1 || mdepth != CV_8U) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
CopyToMaskedFunc func = nullptr;
switch (depth) {
case CV_8U: {}
case CV_8S: switch (cn) {
case 1: func = copyToMasked_e8c1; break;
case 2: func = copyToMasked_e16c1; break;
case 3: func = copyToMasked_e8c3; break;
case 4: func = copyToMasked_e32c1; break;
case 6: func = copyToMasked_e16c3; break;
case 8: func = copyToMasked_e64c1; break;
default: func = nullptr;
}; break;
case CV_16U: {}
case CV_16S: switch (cn) {
case 1: func = copyToMasked_e16c1; break;
case 2: func = copyToMasked_e32c1; break;
case 3: func = copyToMasked_e16c3; break;
case 4: func = copyToMasked_e64c1; break;
case 6: func = copyToMasked_e32c3; break;
case 8: func = copyToMasked_e64c2; break;
default: func = nullptr; break;
}; break;
case CV_32S: {}
case CV_32F: switch (cn) {
case 1: func = copyToMasked_e32c1; break;
case 2: func = copyToMasked_e64c1; break;
case 3: func = copyToMasked_e32c3; break;
case 4: func = copyToMasked_e64c2; break;
case 6: func = copyToMasked_e64c3; break;
case 8: func = copyToMasked_e64c4; break;
default: func = nullptr; break;
}; break;
case CV_64F: switch (cn) {
case 1: func = copyToMasked_e64c1; break;
case 2: func = copyToMasked_e64c2; break;
case 3: func = copyToMasked_e64c3; break;
case 4: func = copyToMasked_e64c4; break;
default: func = nullptr; break;
}; break;
default: func = nullptr;
}
static CopyToMaskedFunc tab[] = {
0, copyToMasked_e8c1, copyToMasked_e16c1, copyToMasked_e8c3,
copyToMasked_e32c1, 0, copyToMasked_e16c3, 0,
copyToMasked_e64c1, 0, 0, 0,
copyToMasked_e32c3, 0, 0, 0,
copyToMasked_e64c2, 0, 0, 0,
0, 0, 0, 0,
copyToMasked_e64c3, 0, 0, 0,
0, 0, 0, 0,
copyToMasked_e64c4
};
size_t elem_size = CV_ELEM_SIZE(type);
CopyToMaskedFunc func = elem_size <= 32 ? tab[elem_size] : nullptr;
if (func == nullptr) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
sizeof(uchar), sizeof(schar),
sizeof(ushort), sizeof(short),
sizeof(int), sizeof(float),
sizeof(int64_t), 0,
};
CV_Assert(elem_size_tab[depth]);
bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1));
bool dst_continuous = (dst_step == width * elem_size_tab[depth] * cn || (dst_step != width * elem_size_tab[depth] * cn && height == 1));
size_t elem_size1 = static_cast<size_t>(CV_ELEM_SIZE1(type));
bool src_continuous = (src_step == width * elem_size1 * cn || (src_step != width * elem_size1 * cn && height == 1));
bool dst_continuous = (dst_step == width * elem_size1 * cn || (dst_step != width * elem_size1 * cn && height == 1));
bool mask_continuous = (mask_step == static_cast<size_t>(width));
size_t nplanes = 1;
int _width = width, _height = height;
@ -189,6 +149,6 @@ inline int copyToMasked(const uchar *src_data, size_t src_step, uchar *dst_data,
return CV_HAL_ERROR_OK;
}
}} // cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -0,0 +1,299 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#include "rvv_hal.hpp"
#include "common.hpp"
#include <limits>
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Thin overloaded wrappers over the RVV 1.0 intrinsics so the templated
// kernels below can be written generically over the element type.
// All types share one lane-count ratio: e8m1 <-> e16m2 <-> e32m4, so a
// single setvl() result is valid for every overload used in one iteration.

// Vector-length setup for the common e8m1 ratio.
inline size_t setvl(int l) { return __riscv_vsetvl_e8m1(l); }

// Unit-stride loads, one overload per supported element type.
inline vuint8m1_t vle(const uint8_t *p, int vl) { return __riscv_vle8_v_u8m1(p, vl); }
inline vint8m1_t vle(const int8_t *p, int vl) { return __riscv_vle8_v_i8m1(p, vl); }
inline vuint16m2_t vle(const uint16_t *p, int vl) { return __riscv_vle16_v_u16m2(p, vl); }
inline vint16m2_t vle(const int16_t *p, int vl) { return __riscv_vle16_v_i16m2(p, vl); }
inline vint32m4_t vle(const int *p, int vl) { return __riscv_vle32_v_i32m4(p, vl); }
inline vfloat32m4_t vle(const float *p, int vl) { return __riscv_vle32_v_f32m4(p, vl); }

// Unit-stride stores, mirroring the load overloads above.
inline void vse(uint8_t *p, const vuint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
inline void vse(int8_t *p, const vint8m1_t &v, int vl) { __riscv_vse8(p, v, vl); }
inline void vse(uint16_t *p, const vuint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
inline void vse(int16_t *p, const vint16m2_t &v, int vl) { __riscv_vse16(p, v, vl); }
inline void vse(int *p, const vint32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }
inline void vse(float *p, const vfloat32m4_t &v, int vl) { __riscv_vse32(p, v, vl); }

// Widen 8-bit lanes to 16 bits (zero- or sign-extend by signedness).
inline vuint16m2_t ext(const vuint8m1_t &v, const int vl) { return __riscv_vzext_vf2(v, vl); }
inline vint16m2_t ext(const vint8m1_t &v, const int vl) { return __riscv_vsext_vf2(v, vl); }

// Narrow 16-bit lanes back to 8 bits with saturation (shift 0, RNU rounding).
inline vuint8m1_t nclip(const vuint16m2_t &v, const int vl) { return __riscv_vnclipu(v, 0, __RISCV_VXRM_RNU, vl); }
inline vint8m1_t nclip(const vint16m2_t &v, const int vl) { return __riscv_vnclip(v, 0, __RISCV_VXRM_RNU, vl); }
// div_sat: per-lane saturated scale * v1 / v2.
// Generic 8-bit path: widen both operands to 16 bits, divide in the 16-bit
// specialization, then narrow back with saturation via nclip().
template <typename VT> inline
VT div_sat(const VT &v1, const VT &v2, const float scale, const int vl) {
    return nclip(div_sat(ext(v1, vl), ext(v2, vl), scale, vl), vl);
}

// 16-bit signed: widen to f32, multiply v1 by (scale * 1/v2) using the
// project reciprocal helper (common::__riscv_vfrec — see common.hpp),
// then narrow-convert back to i16 with saturation.
template <> inline
vint16m2_t div_sat(const vint16m2_t &v1, const vint16m2_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfwcvt_f(v1, vl);
    auto f2 = __riscv_vfwcvt_f(v2, vl);
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfncvt_x(res, vl);
}

// 16-bit unsigned: same pipeline, unsigned narrowing convert.
template <> inline
vuint16m2_t div_sat(const vuint16m2_t &v1, const vuint16m2_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfwcvt_f(v1, vl);
    auto f2 = __riscv_vfwcvt_f(v2, vl);
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfncvt_xu(res, vl);
}

// 32-bit signed: convert in place (no widening), divide in f32, convert back.
template <> inline
vint32m4_t div_sat(const vint32m4_t &v1, const vint32m4_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfcvt_f(v1, vl);
    auto f2 = __riscv_vfcvt_f(v2, vl);
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfcvt_x(res, vl);
}

// 32-bit unsigned variant of the above.
template <> inline
vuint32m4_t div_sat(const vuint32m4_t &v1, const vuint32m4_t &v2, const float scale, const int vl) {
    auto f1 = __riscv_vfcvt_f(v1, vl);
    auto f2 = __riscv_vfcvt_f(v2, vl);
    auto res = __riscv_vfmul(f1, __riscv_vfmul(common::__riscv_vfrec(f2, vl), scale, vl), vl);
    return __riscv_vfcvt_xu(res, vl);
}
// recip_sat: per-lane saturated scale / v.
// Generic 8-bit path: widen to 16 bits, compute in the 16-bit
// specialization, then narrow back with saturation via nclip().
template <typename VT> inline
VT recip_sat(const VT &v, const float scale, const int vl) {
    return nclip(recip_sat(ext(v, vl), scale, vl), vl);
}

// 16-bit signed: widen to f32, scale the reciprocal
// (common::__riscv_vfrec — project helper, see common.hpp),
// narrow-convert back to i16 with saturation.
template <> inline
vint16m2_t recip_sat(const vint16m2_t &v, const float scale, const int vl) {
    auto f = __riscv_vfwcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfncvt_x(res, vl);
}

// 16-bit unsigned variant.
template <> inline
vuint16m2_t recip_sat(const vuint16m2_t &v, const float scale, const int vl) {
    auto f = __riscv_vfwcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfncvt_xu(res, vl);
}

// 32-bit signed: same-width convert to f32 and back.
template <> inline
vint32m4_t recip_sat(const vint32m4_t &v, const float scale, const int vl) {
    auto f = __riscv_vfcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfcvt_x(res, vl);
}

// 32-bit unsigned variant.
template <> inline
vuint32m4_t recip_sat(const vuint32m4_t &v, const float scale, const int vl) {
    auto f = __riscv_vfcvt_f(v, vl);
    auto res = __riscv_vfmul(common::__riscv_vfrec(f, vl), scale, vl);
    return __riscv_vfcvt_xu(res, vl);
}
// Implementation
// Element-wise division for integral ST: dst = sat(scale * src1 / src2),
// with dst forced to 0 wherever src2 == 0.
// step1/step2/step are byte strides; width/height are in elements.
template <typename ST> inline
int div(const ST *src1, size_t step1, const ST *src2, size_t step2,
        ST *dst, size_t step, int width, int height, float scale) {
    float max_fval = static_cast<float>(std::numeric_limits<ST>::max());
    // Fast path: if scale is 0, or |scale * max(ST)| < 1 so that every
    // quotient magnitude stays below 1 and would round to zero for an
    // integral ST, just zero-fill the destination.
    if (scale == 0.f || ((scale * max_fval) < 1.f && (scale * max_fval) > -1.f)) {
        for (int h = 0; h < height; h++) {
            ST *dst_h = reinterpret_cast<ST*>((uchar*)dst + h * step);
            std::memset(dst_h, 0, sizeof(ST) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    for (int h = 0; h < height; h++) {
        // Row bases are computed from byte strides, hence the uchar* casts.
        const ST *src1_h = reinterpret_cast<const ST*>((const uchar*)src1 + h * step1);
        const ST *src2_h = reinterpret_cast<const ST*>((const uchar*)src2 + h * step2);
        ST *dst_h = reinterpret_cast<ST*>((uchar*)dst + h * step);
        int vl;
        for (int w = 0; w < width; w += vl) {
            vl = setvl(width - w);
            auto v1 = vle(src1_h + w, vl);
            auto v2 = vle(src2_h + w, vl);
            // Mask marks zero divisors; vmerge writes 0 into those lanes.
            auto mask = __riscv_vmseq(v2, 0, vl);
            vse(dst_h + w, __riscv_vmerge(div_sat(v1, v2, scale, vl), 0, mask, vl), vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// Float specialization of div: dst = scale * src1 / src2 with no
// divisor-zero masking (IEEE semantics of the reciprocal helper apply).
// Three paths: zero-fill when scale == 0, a no-scale fast path when
// scale is (approximately) 1, and the general scaled path.
template <>
int div(const float *src1, size_t step1, const float *src2, size_t step2,
        float *dst, size_t step, int width, int height, float scale) {
    if (scale == 0.f) {
        for (int h = 0; h < height; h++) {
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            std::memset(dst_h, 0, sizeof(float) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    if (std::fabs(scale - 1.f) < FLT_EPSILON) {
        // scale == 1: skip the extra multiply by scale in the inner loop.
        for (int h = 0; h < height; h++) {
            const float *src1_h = reinterpret_cast<const float*>((const uchar*)src1 + h * step1);
            const float *src2_h = reinterpret_cast<const float*>((const uchar*)src2 + h * step2);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
                // v1 * (1 / v2) via the project reciprocal helper.
                vse(dst_h + w, __riscv_vfmul(v1, common::__riscv_vfrec(v2, vl), vl), vl);
            }
        }
    } else {
        for (int h = 0; h < height; h++) {
            const float *src1_h = reinterpret_cast<const float*>((const uchar*)src1 + h * step1);
            const float *src2_h = reinterpret_cast<const float*>((const uchar*)src2 + h * step2);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst + h * step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v1 = vle(src1_h + w, vl);
                auto v2 = vle(src2_h + w, vl);
                // v1 * (scale * (1 / v2)).
                vse(dst_h + w, __riscv_vfmul(v1, __riscv_vfmul(common::__riscv_vfrec(v2, vl), scale, vl), vl), vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
// Element-wise reciprocal for integral ST: dst = sat(scale / src),
// with dst forced to 0 wherever src == 0.
// src_step/dst_step are byte strides; width/height are in elements.
template <typename ST> inline
int recip(const ST *src_data, size_t src_step, ST *dst_data, size_t dst_step,
          int width, int height, float scale) {
    // Fast path: if |scale| < 1, then |scale / src| < 1 for any nonzero
    // integral src, so every output rounds to zero — just zero-fill.
    if (scale == 0.f || (scale < 1.f && scale > -1.f)) {
        for (int h = 0; h < height; h++) {
            ST *dst_h = reinterpret_cast<ST*>((uchar*)dst_data + h * dst_step);
            std::memset(dst_h, 0, sizeof(ST) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    for (int h = 0; h < height; h++) {
        const ST *src_h = reinterpret_cast<const ST*>((const uchar*)src_data + h * src_step);
        ST *dst_h = reinterpret_cast<ST*>((uchar*)dst_data + h * dst_step);
        int vl;
        for (int w = 0; w < width; w += vl) {
            vl = setvl(width - w);
            auto v = vle(src_h + w, vl);
            // Mask marks zero sources; vmerge writes 0 into those lanes.
            auto mask = __riscv_vmseq(v, 0, vl);
            vse(dst_h + w, __riscv_vmerge(recip_sat(v, scale, vl), 0, mask, vl), vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// Float specialization of recip: dst = scale / src with no zero-source
// masking (IEEE semantics of the reciprocal helper apply).
// Three paths: zero-fill when scale == 0, plain reciprocal when scale
// is (approximately) 1, and the general scaled path.
template <>
int recip(const float *src_data, size_t src_step, float *dst_data, size_t dst_step,
          int width, int height, float scale) {
    if (scale == 0.f) {
        for (int h = 0; h < height; h++) {
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            std::memset(dst_h, 0, sizeof(float) * width);
        }
        return CV_HAL_ERROR_OK;
    }
    if (std::fabs(scale - 1.f) < FLT_EPSILON) {
        // scale == 1: store the reciprocal directly.
        for (int h = 0; h < height; h++) {
            const float *src_h = reinterpret_cast<const float*>((const uchar*)src_data + h * src_step);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v = vle(src_h + w, vl);
                vse(dst_h + w, common::__riscv_vfrec(v, vl), vl);
            }
        }
    } else {
        for (int h = 0; h < height; h++) {
            const float *src_h = reinterpret_cast<const float*>((const uchar*)src_data + h * src_step);
            float *dst_h = reinterpret_cast<float*>((uchar*)dst_data + h * dst_step);
            int vl;
            for (int w = 0; w < width; w += vl) {
                vl = setvl(width - w);
                auto v = vle(src_h + w, vl);
                // scale * (1 / v).
                vse(dst_h + w, __riscv_vfmul(common::__riscv_vfrec(v, vl), scale, vl), vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL entry points for element-wise division, one per element type.
// Each forwards to the templated kernel above (the element type is deduced
// from the pointer arguments); the HAL interface passes `scale` as double,
// which is narrowed explicitly to the kernel's float parameter.
int div8u(const uchar *src1_data, size_t src1_step, const uchar *src2_data, size_t src2_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
int div8s(const schar *src1_data, size_t src1_step, const schar *src2_data, size_t src2_step, schar *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
int div16u(const ushort *src1_data, size_t src1_step, const ushort *src2_data, size_t src2_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
int div16s(const short *src1_data, size_t src1_step, const short *src2_data, size_t src2_step, short *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
int div32s(const int *src1_data, size_t src1_step, const int *src2_data, size_t src2_step, int *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
int div32f(const float *src1_data, size_t src1_step, const float *src2_data, size_t src2_step, float *dst_data, size_t dst_step, int width, int height, double scale) {
    return div(src1_data, src1_step, src2_data, src2_step,
               dst_data, dst_step, width, height, static_cast<float>(scale));
}
// HAL entry points for element-wise reciprocal, one per element type.
// Each forwards to the templated kernel above (the element type is deduced
// from the pointer arguments); the HAL interface passes `scale` as double,
// which is narrowed explicitly to the kernel's float parameter.
int recip8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
int recip8s(const schar *src_data, size_t src_step, schar *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
int recip16u(const ushort *src_data, size_t src_step, ushort *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
int recip16s(const short *src_data, size_t src_step, short *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
int recip32s(const int *src_data, size_t src_step, int *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
int recip32f(const float *src_data, size_t src_step, float *dst_data, size_t dst_step, int width, int height, double scale) {
    return recip(src_data, src_step, dst_data, dst_step,
                 width, height, static_cast<float>(scale));
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -5,21 +5,16 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
#define OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
#include <riscv_vector.h>
#include "rvv_hal.hpp"
#include <algorithm>
namespace cv { namespace cv_hal_rvv { namespace dotprod {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_dotProduct
#define cv_hal_dotProduct cv::cv_hal_rvv::dotprod::dotprod
#if CV_HAL_RVV_1P0_ENABLED
namespace {
double dotProd_8u(const uchar *a, const uchar *b, int len) {
static inline double dotProd_8u(const uchar *a, const uchar *b, int len) {
constexpr int block_size0 = (1 << 15);
double r = 0;
@ -47,7 +42,7 @@ double dotProd_8u(const uchar *a, const uchar *b, int len) {
return r;
}
double dotProd_8s(const schar *a, const schar *b, int len) {
static inline double dotProd_8s(const schar *a, const schar *b, int len) {
constexpr int block_size0 = (1 << 14);
double r = 0;
@ -75,7 +70,7 @@ double dotProd_8s(const schar *a, const schar *b, int len) {
return r;
}
double dotProd_16u(const ushort *a, const ushort *b, int len) {
static inline double dotProd_16u(const ushort *a, const ushort *b, int len) {
constexpr int block_size0 = (1 << 24);
double r = 0;
@ -103,7 +98,7 @@ double dotProd_16u(const ushort *a, const ushort *b, int len) {
return r;
}
double dotProd_16s(const short *a, const short *b, int len) {
static inline double dotProd_16s(const short *a, const short *b, int len) {
constexpr int block_size0 = (1 << 24);
double r = 0;
@ -131,7 +126,7 @@ double dotProd_16s(const short *a, const short *b, int len) {
return r;
}
double dotProd_32s(const int *a, const int *b, int len) {
static inline double dotProd_32s(const int *a, const int *b, int len) {
double r = 0;
vfloat64m8_t s = __riscv_vfmv_v_f_f64m8(0.f, __riscv_vsetvlmax_e64m8());
@ -149,7 +144,7 @@ double dotProd_32s(const int *a, const int *b, int len) {
return r;
}
double dotProd_32f(const float *a, const float *b, int len) {
static inline double dotProd_32f(const float *a, const float *b, int len) {
constexpr int block_size0 = (1 << 11);
double r = 0.f;
@ -180,7 +175,7 @@ double dotProd_32f(const float *a, const float *b, int len) {
} // anonymous
using DotProdFunc = double (*)(const uchar *a, const uchar *b, int len);
inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step,
int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size_t b_step,
int width, int height, int type, double *dot_val) {
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@ -195,16 +190,9 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
sizeof(uchar), sizeof(schar),
sizeof(ushort), sizeof(short),
sizeof(int), sizeof(float),
sizeof(int64_t), 0,
};
CV_Assert(elem_size_tab[depth]);
bool a_continuous = (a_step == width * elem_size_tab[depth] * cn);
bool b_continuous = (b_step == width * elem_size_tab[depth] * cn);
size_t elem_size1 = static_cast<size_t>(CV_ELEM_SIZE1(type));
bool a_continuous = (a_step == width * elem_size1 * cn);
bool b_continuous = (b_step == width * elem_size1 * cn);
size_t nplanes = 1;
size_t len = width * height;
if (!a_continuous || !b_continuous) {
@ -228,6 +216,6 @@ inline int dotprod(const uchar *a_data, size_t a_step, const uchar *b_data, size
return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::dotprod
#endif // CV_HAL_RVV_1P0_ENABLED
#endif // OPENCV_HAL_RVV_DOTPROD_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -4,17 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_DXT_HPP_INCLUDED
#define OPENCV_HAL_RVV_DXT_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
#include "opencv2/core/types.hpp"
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv { namespace dxt {
#undef cv_hal_dft
#define cv_hal_dft cv::cv_hal_rvv::dxt::dft
#if CV_HAL_RVV_1P0_ENABLED
template<typename T> struct rvv;
@ -545,8 +539,8 @@ inline int dft(const Complex<T>* src, Complex<T>* dst, int nf, int *factors, T s
return CV_HAL_ERROR_OK;
}
inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale, int* itab, void* wave,
int tab_size, int n, bool isInverse, bool noPermute)
int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, double scale,
int* itab, void* wave, int tab_size, int n, bool isInverse, bool noPermute)
{
if( n == 0 )
return CV_HAL_ERROR_OK;
@ -563,6 +557,6 @@ inline int dft(const uchar* src, uchar* dst, int depth, int nf, int *factors, do
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -4,17 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_EXP_HPP_INCLUDED
#define OPENCV_HAL_RVV_EXP_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_exp32f
#define cv_hal_exp32f cv::cv_hal_rvv::exp32f
#undef cv_hal_exp64f
#define cv_hal_exp64f cv::cv_hal_rvv::exp64f
#if CV_HAL_RVV_1P0_ENABLED
namespace detail {
@ -116,7 +110,7 @@ static constexpr double exp_tab_64f[exp_tab_size] = EXP_TAB_VALUE;
} // namespace detail
inline int exp32f(const float* src, float* dst, int _len)
int exp32f(const float* src, float* dst, int _len)
{
size_t vl = __riscv_vsetvlmax_e32m4();
auto exp_a2 = __riscv_vfmv_v_f_f32m4(detail::exp32f_a2, vl);
@ -158,7 +152,7 @@ inline int exp32f(const float* src, float* dst, int _len)
return CV_HAL_ERROR_OK;
}
inline int exp64f(const double* src, double* dst, int _len)
int exp64f(const double* src, double* dst, int _len)
{
size_t vl = __riscv_vsetvlmax_e64m4();
// all vector registers are used up, so not load more constants
@ -203,6 +197,6 @@ inline int exp64f(const double* src, double* dst, int _len)
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_EXP_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -5,13 +5,7 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
#ifndef OPENCV_HAL_RVV_FLIP_HPP_INCLUDED
#define OPENCV_HAL_RVV_FLIP_HPP_INCLUDED
#include <riscv_vector.h>
#include <opencv2/core/base.hpp>
#include "hal_rvv_1p0/types.hpp"
#include "rvv_hal.hpp"
#if defined (__clang__) && __clang_major__ < 18
#define OPENCV_HAL_IMPL_RVV_VCREATE_x3(suffix, width, v0, v1, v2) \
@ -24,10 +18,9 @@
#define __riscv_vcreate_v_u64m2x3(v0, v1, v2) OPENCV_HAL_IMPL_RVV_VCREATE_x3(u64, 2, v0, v1, v2)
#endif
namespace cv { namespace cv_hal_rvv {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_flip
#define cv_hal_flip cv::cv_hal_rvv::flip
#if CV_HAL_RVV_1P0_ENABLED
namespace {
@ -73,6 +66,13 @@ CV_HAL_RVV_FLIP_INPLACE_C1(16UC1, ushort, RVV_U16M8)
CV_HAL_RVV_FLIP_INPLACE_C1(32UC1, unsigned, RVV_U32M8)
CV_HAL_RVV_FLIP_INPLACE_C1(64UC1, uint64_t, RVV_U64M8)
// Suppress warnings of "ignoring attributes applied to VecType after definition",
// VecType is vuint8m2x3_t, vuint16m2x3_t, vuint32m2x3_t or vuint64m2x3_t
#if defined (__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#define CV_HAL_RVV_FLIP_C3_TYPES(width) \
struct RVV_C3_U##width##M2 : RVV_U##width##M2 { \
static inline vuint##width##m2x3_t vload3(const uint##width##_t *base, size_t vl) { return __riscv_vlseg3e##width##_v_u##width##m2x3(base, vl); } \
@ -90,6 +90,10 @@ CV_HAL_RVV_FLIP_C3_TYPES(16)
CV_HAL_RVV_FLIP_C3_TYPES(32)
CV_HAL_RVV_FLIP_C3_TYPES(64)
#if defined (__GNUC__)
#pragma GCC diagnostic pop
#endif
#define CV_HAL_RVV_FLIP_C3(name, _Tps, RVV) \
inline void flip_##name(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int src_width, int src_height, int flip_mode) { \
for (int h = 0; h < src_height; h++) { \
@ -302,7 +306,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height
0, 0, 0, 0,
0
};
FlipInplaceFunc func = flip_inplace_func_tab[esz];
FlipInplaceFunc func = esz <= 32 ? flip_inplace_func_tab[esz] : nullptr;
if (!func) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
@ -311,7 +315,7 @@ inline int flip_inplace(int esz, uchar* data, size_t step, int width, int height
return CV_HAL_ERROR_OK;
}
inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
int flip(int src_type, const uchar* src_data, size_t src_step, int src_width, int src_height,
uchar* dst_data, size_t dst_step, int flip_mode)
{
int esz = CV_ELEM_SIZE(src_type);
@ -344,7 +348,7 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi
0, 0, 0, 0,
0
};
FlipFunc func = flip_func_tab[esz];
FlipFunc func = esz <= 32 ? flip_func_tab[esz] : nullptr;
if (func) {
func(src_data, src_step, dst_data, dst_step, src_width, src_height, flip_mode);
return CV_HAL_ERROR_OK;
@ -368,6 +372,6 @@ inline int flip(int src_type, const uchar* src_data, size_t src_step, int src_wi
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_FLIP_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -4,17 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_LOG_HPP_INCLUDED
#define OPENCV_HAL_RVV_LOG_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_log32f
#define cv_hal_log32f cv::cv_hal_rvv::log32f
#undef cv_hal_log64f
#define cv_hal_log64f cv::cv_hal_rvv::log64f
#if CV_HAL_RVV_1P0_ENABLED
namespace detail {
@ -306,7 +300,7 @@ static constexpr double log_tab_64f[log_tab_size] = LOG_TAB_VALUE;
} // namespace detail
inline int log32f(const float* src, float* dst, int _len)
int log32f(const float* src, float* dst, int _len)
{
size_t vl = __riscv_vsetvlmax_e32m4();
auto log_a2 = __riscv_vfmv_v_f_f32m4(detail::log32f_a2, vl);
@ -340,7 +334,7 @@ inline int log32f(const float* src, float* dst, int _len)
return CV_HAL_ERROR_OK;
}
inline int log64f(const double* src, double* dst, int _len)
int log64f(const double* src, double* dst, int _len)
{
size_t vl = __riscv_vsetvlmax_e64m4();
// all vector registers are used up, so not load more constants
@ -382,6 +376,6 @@ inline int log64f(const double* src, double* dst, int _len)
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_LOG_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -4,21 +4,16 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_LU_HPP_INCLUDED
#define OPENCV_HAL_RVV_LU_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <cfloat>
#include <cmath>
#include <typeinfo>
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv { namespace lu {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_LU32f
#define cv_hal_LU32f cv::cv_hal_rvv::lu::LU<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_LU64f
#define cv_hal_LU64f cv::cv_hal_rvv::lu::LU<cv::cv_hal_rvv::RVV_F64M4>
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::LUImpl
@ -167,6 +162,15 @@ inline int LU(T* src1, size_t src1_step, int m, T* src2, size_t src2_step, int n
return CV_HAL_ERROR_OK;
}
}}}
} // anonymous
#endif
int LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) {
return LU<RVV_F32M4>(src1, src1_step, m, src2, src2_step, n, info);
}
int LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) {
return LU<RVV_F64M4>(src1, src1_step, m, src2, src2_step, n, info);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,19 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_LUT_HPP_INCLUDED
#define OPENCV_HAL_RVV_LUT_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include <opencv2/core/base.hpp>
#include <opencv2/core/utility.hpp>
namespace cv { namespace rvv_hal { namespace core {
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_lut
#define cv_hal_lut cv::cv_hal_rvv::lut
#if CV_HAL_RVV_1P0_ENABLED
// need vlen >= 256
struct LUTCacheU8 : RVV_U8M8
@ -135,7 +127,7 @@ private:
LUTParallelBody& operator=(const LUTParallelBody&);
};
inline int lut(const uchar* src_data,
int lut(const uchar* src_data,
size_t src_step,
size_t src_type,
const uchar* lut_data,
@ -191,6 +183,6 @@ inline int lut(const uchar* src_data,
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_LUT_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -4,20 +4,14 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
#define OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "common.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
#include "hal_rvv_1p0/sqrt.hpp"
#include "hal_rvv_1p0/types.hpp"
#if CV_HAL_RVV_1P0_ENABLED
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_magnitude32f
#define cv_hal_magnitude32f cv::cv_hal_rvv::magnitude<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
#undef cv_hal_magnitude64f
#define cv_hal_magnitude64f cv::cv_hal_rvv::magnitude<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
namespace {
template <typename SQRT_T, typename T = typename SQRT_T::T::ElemType>
inline int magnitude(const T* x, const T* y, T* dst, int len)
@ -30,13 +24,22 @@ inline int magnitude(const T* x, const T* y, T* dst, int len)
auto vx = SQRT_T::T::vload(x, vl);
auto vy = SQRT_T::T::vload(y, vl);
auto vmag = detail::sqrt<SQRT_T::iter_times>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
auto vmag = common::sqrt<SQRT_T::iter_times>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
SQRT_T::T::vstore(dst, vmag, vl);
}
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
} // anonymous
#endif // OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
int magnitude32f(const float *x, const float *y, float *dst, int len) {
return magnitude<common::Sqrt32f<RVV_F32M8>>(x, y, dst, len);
}
int magnitude64f(const double *x, const double *y, double *dst, int len) {
return magnitude<common::Sqrt64f<RVV_F64M8>>(x, y, dst, len);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,15 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
#define OPENCV_HAL_RVV_MEANSTDDEV_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_meanStdDev
#define cv_hal_meanStdDev cv::cv_hal_rvv::meanStdDev
#if CV_HAL_RVV_1P0_ENABLED
inline int meanStdDev_8UC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
@ -21,8 +17,8 @@ inline int meanStdDev_8UC4(const uchar* src_data, size_t src_step, int width, in
inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, int height,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step);
inline int meanStdDev(const uchar* src_data, size_t src_step, int width, int height,
int src_type, double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
int meanStdDev(const uchar* src_data, size_t src_step, int width, int height, int src_type,
double* mean_val, double* stddev_val, uchar* mask, size_t mask_step) {
switch (src_type)
{
case CV_8UC1:
@ -226,6 +222,6 @@ inline int meanStdDev_32FC1(const uchar* src_data, size_t src_step, int width, i
return CV_HAL_ERROR_OK;
}
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -4,21 +4,7 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
#define OPENCV_HAL_RVV_MERGE_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_merge8u
#define cv_hal_merge8u cv::cv_hal_rvv::merge8u
#undef cv_hal_merge16u
#define cv_hal_merge16u cv::cv_hal_rvv::merge16u
#undef cv_hal_merge32s
#define cv_hal_merge32s cv::cv_hal_rvv::merge32s
#undef cv_hal_merge64s
#define cv_hal_merge64s cv::cv_hal_rvv::merge64s
#include "rvv_hal.hpp"
#if defined __clang__ && __clang_major__ < 18
#define OPENCV_HAL_IMPL_RVV_VCREATE_x2(suffix, width, v0, v1) \
@ -44,7 +30,11 @@ namespace cv { namespace cv_hal_rvv {
#define __riscv_vcreate_v_u16m2x4(v0, v1, v2, v3) OPENCV_HAL_IMPL_RVV_VCREATE_x4(u16, 2, v0, v1, v2, v3)
#endif // clang < 18
inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
int vl = 0;
if (cn == 1)
{
@ -129,7 +119,7 @@ inline int merge8u(const uchar** src, uchar* dst, int len, int cn ) {
return CV_HAL_ERROR_OK;
}
inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
int vl = 0;
if (cn == 1)
{
@ -217,7 +207,7 @@ inline int merge16u(const ushort** src, ushort* dst, int len, int cn ) {
#if defined __GNUC__ && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge32s(const int** src, int* dst, int len, int cn ) {
int merge32s(const int** src, int* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
@ -287,7 +277,7 @@ inline int merge32s(const int** src, int* dst, int len, int cn ) {
#if defined __GNUC__ && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#endif
inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
int merge64s(const int64** src, int64* dst, int len, int cn ) {
int k = cn % 4 ? cn % 4 : 4;
int i, j;
if( k == 1 )
@ -354,6 +344,6 @@ inline int merge64s(const int64** src, int64* dst, int len, int cn ) {
return CV_HAL_ERROR_OK;
}
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -4,19 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED
#define OPENCV_HAL_RVV_MINMAX_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include <opencv2/core/base.hpp>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv { namespace minmax {
#undef cv_hal_minMaxIdx
#define cv_hal_minMaxIdx cv::cv_hal_rvv::minmax::minMaxIdx
#undef cv_hal_minMaxIdxMaskStep
#define cv_hal_minMaxIdxMaskStep cv::cv_hal_rvv::minmax::minMaxIdx
#if CV_HAL_RVV_1P0_ENABLED
template<typename VEC_T, typename BOOL_T, typename T = typename VEC_T::ElemType>
inline int minMaxIdxReadTwice(const uchar* src_data, size_t src_step, int width, int height, double* minVal, double* maxVal,
@ -269,8 +261,8 @@ inline int minMaxIdxReadOnce(const uchar* src_data, size_t src_step, int width,
return CV_HAL_ERROR_OK;
}
inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth, double* minVal, double* maxVal,
int* minIdx, int* maxIdx, uchar* mask, size_t mask_step = 0)
int minMaxIdx(const uchar* src_data, size_t src_step, int width, int height, int depth,
double* minVal, double* maxVal, int* minIdx, int* maxIdx, uchar* mask, size_t mask_step)
{
if (!mask_step)
mask_step = src_step;
@ -296,6 +288,6 @@ inline int minMaxIdx(const uchar* src_data, size_t src_step, int width, int heig
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -6,15 +6,12 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_NORM_HPP_INCLUDED
#define OPENCV_HAL_RVV_NORM_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace cv_hal_rvv { namespace norm {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_norm
#define cv_hal_norm cv::cv_hal_rvv::norm::norm
#if CV_HAL_RVV_1P0_ENABLED
namespace {
@ -76,7 +73,7 @@ struct NormInf_RVV<schar, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = __riscv_vle8_v_i8m8(src + i, vl);
s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax));
}
@ -106,7 +103,7 @@ struct NormInf_RVV<short, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e16m8(n - i);
auto v = __riscv_vle16_v_i16m8(src + i, vl);
s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax));
}
@ -121,7 +118,7 @@ struct NormInf_RVV<int, int> {
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e32m8(n - i);
auto v = __riscv_vle32_v_i32m8(src + i, vl);
s = __riscv_vmaxu_tu(s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tu(s, s, common::__riscv_vabs(v, vl), vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax));
}
@ -180,7 +177,7 @@ struct NormL1_RVV<schar, int> {
int vl;
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e8m8(n - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl);
auto v = common::__riscv_vabs(__riscv_vle8_v_i8m8(src + i, vl), vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
@ -208,7 +205,7 @@ struct NormL1_RVV<short, int> {
int vl;
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e16m8(n - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl);
auto v = common::__riscv_vabs(__riscv_vle16_v_i16m8(src + i, vl), vl);
s = __riscv_vwredsumu(v, s, vl);
}
return __riscv_vmv_x(s);
@ -223,7 +220,7 @@ struct NormL1_RVV<int, double> {
int vl;
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e32m4(n - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl);
auto v = common::__riscv_vabs(__riscv_vle32_v_i32m4(src + i, vl), vl);
s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl);
}
return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax));
@ -544,7 +541,7 @@ struct MaskedNormInf_RVV<schar, int> {
auto v = __riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl);
}
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax));
@ -560,7 +557,7 @@ struct MaskedNormL1_RVV<schar, int> {
int vl;
for (int i = 0; i < len; i += vl) {
vl = __riscv_vsetvl_e8m8(len - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl);
auto v = common::__riscv_vabs(__riscv_vlse8_v_i8m8(src + cn * i + cn_index, sizeof(schar) * cn, vl), vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
@ -657,7 +654,7 @@ struct MaskedNormInf_RVV<short, int> {
auto v = __riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl);
}
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax));
@ -672,7 +669,7 @@ struct MaskedNormL1_RVV<short, int> {
int vl;
for (int i = 0; i < len; i += vl) {
vl = __riscv_vsetvl_e8m4(len - i);
auto v = custom_intrin::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl);
auto v = common::__riscv_vabs(__riscv_vlse16_v_i16m8(src + cn * i + cn_index, sizeof(short) * cn, vl), vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu_tum(b, s, v, s, vl);
@ -714,7 +711,7 @@ struct MaskedNormInf_RVV<int, int> {
auto v = __riscv_vlse32_v_i32m8(src + cn * i + cn_index, sizeof(int) * cn, vl);
auto m = __riscv_vle8_v_u8m2(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, custom_intrin::__riscv_vabs(v, vl), vl);
s = __riscv_vmaxu_tumu(b, s, s, common::__riscv_vabs(v, vl), vl);
}
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax));
@ -733,7 +730,7 @@ struct MaskedNormL1_RVV<int, double> {
auto v = __riscv_vlse32_v_i32m4(src + cn * i + cn_index, sizeof(int) * cn, vl);
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, custom_intrin::__riscv_vabs(v, vl), vl), vl);
s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, common::__riscv_vabs(v, vl), vl), vl);
}
}
return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax));
@ -972,8 +969,8 @@ CV_HAL_RVV_DEF_NORM_ALL(64f, double, double, double, double)
}
using NormFunc = int (*)(const uchar*, const uchar*, uchar*, int, int);
inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step, int width,
int height, int type, int norm_type, double* result) {
int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mask_step,
int width, int height, int type, int norm_type, double* result) {
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
if (result == nullptr || depth == CV_16F || norm_type > NORM_L2SQR) {
@ -1004,18 +1001,8 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas
},
};
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
sizeof(uchar), sizeof(schar),
sizeof(ushort), sizeof(short),
sizeof(int), sizeof(float),
sizeof(double), sizeof(cv::hfloat),
sizeof(cv::bfloat), sizeof(bool),
sizeof(uint64_t), sizeof(int64_t),
sizeof(unsigned), 0,
};
CV_Assert(elem_size_tab[depth]);
bool src_continuous = (src_step == width * elem_size_tab[depth] * cn || (src_step != width * elem_size_tab[depth] * cn && height == 1));
size_t elem_size1 = static_cast<size_t>(CV_ELEM_SIZE1(type));
bool src_continuous = (src_step == width * elem_size1 * cn || (src_step != width * elem_size1 * cn && height == 1));
bool mask_continuous = (mask_step == static_cast<size_t>(width));
size_t nplanes = 1;
size_t size = width * height;
@ -1038,7 +1025,7 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas
res.d = 0;
if ((norm_type == NORM_L1 && depth <= CV_16S) ||
((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) {
const size_t esz = elem_size_tab[depth] * cn;
const size_t esz = elem_size1 * cn;
const int total = (int)size;
const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
const int blockSize = std::min(total, intSumBlockSize);
@ -1095,6 +1082,6 @@ inline int norm(const uchar* src, size_t src_step, const uchar* mask, size_t mas
return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::norm
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -6,15 +6,12 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED
#define OPENCV_HAL_RVV_NORM_DIFF_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace cv_hal_rvv { namespace norm_diff {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_normDiff
#define cv_hal_normDiff cv::cv_hal_rvv::norm_diff::normDiff
#if CV_HAL_RVV_1P0_ENABLED
namespace {
@ -64,7 +61,7 @@ struct NormDiffInf_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(n - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vmaxu_tu(s, s, v, vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax));
@ -81,7 +78,7 @@ struct NormDiffInf_RVV<schar, int> {
vl = __riscv_vsetvl_e8m8(n - i);
auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vmaxu_tu(s, s, v, vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u8m1(0, __riscv_vsetvlmax_e8m1()), vlmax));
@ -98,7 +95,7 @@ struct NormDiffInf_RVV<ushort, int> {
vl = __riscv_vsetvl_e16m8(n - i);
auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl);
auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vmaxu_tu(s, s, v, vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax));
@ -115,7 +112,7 @@ struct NormDiffInf_RVV<short, int> {
vl = __riscv_vsetvl_e16m8(n - i);
auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl);
auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vmaxu_tu(s, s, v, vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u16m1(0, __riscv_vsetvlmax_e16m1()), vlmax));
@ -132,7 +129,8 @@ struct NormDiffInf_RVV<int, unsigned> {
vl = __riscv_vsetvl_e32m8(n - i);
auto v1 = __riscv_vle32_v_i32m8(src1 + i, vl);
auto v2 = __riscv_vle32_v_i32m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl); // 5.x
// auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x
s = __riscv_vmaxu_tu(s, s, v, vl);
}
return __riscv_vmv_x(__riscv_vredmaxu(s, __riscv_vmv_s_x_u32m1(0, __riscv_vsetvlmax_e32m1()), vlmax));
@ -183,7 +181,7 @@ struct NormDiffL1_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(n - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
@ -200,7 +198,7 @@ struct NormDiffL1_RVV<schar, int> {
vl = __riscv_vsetvl_e8m8(n - i);
auto v1 = __riscv_vle8_v_i8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_i8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tu(zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
}
return __riscv_vmv_x(s);
@ -216,7 +214,7 @@ struct NormDiffL1_RVV<ushort, int> {
vl = __riscv_vsetvl_e16m8(n - i);
auto v1 = __riscv_vle16_v_u16m8(src1 + i, vl);
auto v2 = __riscv_vle16_v_u16m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(v, s, vl);
}
return __riscv_vmv_x(s);
@ -232,7 +230,7 @@ struct NormDiffL1_RVV<short, int> {
vl = __riscv_vsetvl_e16m8(n - i);
auto v1 = __riscv_vle16_v_i16m8(src1 + i, vl);
auto v2 = __riscv_vle16_v_i16m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(v, s, vl);
}
return __riscv_vmv_x(s);
@ -249,7 +247,8 @@ struct NormDiffL1_RVV<int, double> {
vl = __riscv_vsetvl_e32m4(n - i);
auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl);
auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl); // 5.x
// auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x
s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v, vl), vl);
}
return __riscv_vfmv_f(__riscv_vfredosum(s, __riscv_vfmv_s_f_f64m1(0, __riscv_vsetvlmax_e64m1()), vlmax));
@ -299,7 +298,7 @@ struct NormDiffL2_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m4(n - i);
auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl);
}
return __riscv_vmv_x(s);
@ -315,7 +314,7 @@ struct NormDiffL2_RVV<schar, int> {
vl = __riscv_vsetvl_e8m4(n - i);
auto v1 = __riscv_vle8_v_i8m4(src1 + i, vl);
auto v2 = __riscv_vle8_v_i8m4(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
s = __riscv_vwredsumu(__riscv_vwmulu(v, v, vl), s, vl);
}
return __riscv_vmv_x(s);
@ -332,7 +331,7 @@ struct NormDiffL2_RVV<ushort, double> {
vl = __riscv_vsetvl_e16m2(n - i);
auto v1 = __riscv_vle16_v_u16m2(src1 + i, vl);
auto v2 = __riscv_vle16_v_u16m2(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto v_mul = __riscv_vwmulu(v, v, vl);
s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl);
}
@ -350,7 +349,7 @@ struct NormDiffL2_RVV<short, double> {
vl = __riscv_vsetvl_e16m2(n - i);
auto v1 = __riscv_vle16_v_i16m2(src1 + i, vl);
auto v2 = __riscv_vle16_v_i16m2(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto v_mul = __riscv_vwmulu(v, v, vl);
s = __riscv_vfadd_tu(s, s, __riscv_vfwcvt_f(v_mul, vl), vl);
}
@ -368,7 +367,7 @@ struct NormDiffL2_RVV<int, double> {
vl = __riscv_vsetvl_e32m4(n - i);
auto v1 = __riscv_vle32_v_i32m4(src1 + i, vl);
auto v2 = __riscv_vle32_v_i32m4(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto v_mul = __riscv_vwmulu(v, v, vl);
s = __riscv_vfadd_tu(s, s, __riscv_vfcvt_f(v_mul, vl), vl);
}
@ -471,7 +470,7 @@ struct MaskedNormDiffInf_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -482,7 +481,7 @@ struct MaskedNormDiffInf_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m2(len - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4);
auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4);
auto v = common::__riscv_vabd(v1, v2, vl * 4);
auto m = __riscv_vle8_v_u8m2(mask + i, vl);
auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4);
s = __riscv_vmaxu_tumu(b, s, s, v, vl * 4);
@ -494,7 +493,7 @@ struct MaskedNormDiffInf_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -516,7 +515,7 @@ struct MaskedNormDiffInf_RVV<schar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -537,7 +536,7 @@ struct MaskedNormDiffInf_RVV<ushort, int> {
vl = __riscv_vsetvl_e16m8(len - i);
auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -558,7 +557,7 @@ struct MaskedNormDiffInf_RVV<short, int> {
vl = __riscv_vsetvl_e16m8(len - i);
auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -579,7 +578,8 @@ struct MaskedNormDiffInf_RVV<int, unsigned> {
vl = __riscv_vsetvl_e32m8(len - i);
auto v1 = __riscv_vlse32_v_i32m8(src1 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v2 = __riscv_vlse32_v_i32m8(src2 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl); // 5.x
// auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x
auto m = __riscv_vle8_v_u8m2(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vmaxu_tumu(b, s, s, v, vl);
@ -655,7 +655,7 @@ struct MaskedNormDiffL1_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m8(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
@ -666,7 +666,7 @@ struct MaskedNormDiffL1_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m2(len - i);
auto v1 = __riscv_vle8_v_u8m8(src1 + i * 4, vl * 4);
auto v2 = __riscv_vle8_v_u8m8(src2 + i * 4, vl * 4);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4);
auto v = common::__riscv_vabd(v1, v2, vl * 4);
auto m = __riscv_vle8_v_u8m2(mask + i, vl);
auto b = __riscv_vmsne(__riscv_vreinterpret_u8m8(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4);
s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl * 4), s, __riscv_vsetvlmax_e16m1());
@ -678,7 +678,7 @@ struct MaskedNormDiffL1_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vlse8_v_u8m8(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v2 = __riscv_vlse8_v_u8m8(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
@ -700,7 +700,7 @@ struct MaskedNormDiffL1_RVV<schar, int> {
vl = __riscv_vsetvl_e8m8(len - i);
auto v1 = __riscv_vlse8_v_i8m8(src1 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v2 = __riscv_vlse8_v_i8m8(src2 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m8(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(__riscv_vwredsumu_tum(b, zero, v, zero, vl), s, __riscv_vsetvlmax_e16m1());
@ -720,7 +720,7 @@ struct MaskedNormDiffL1_RVV<ushort, int> {
vl = __riscv_vsetvl_e8m4(len - i);
auto v1 = __riscv_vlse16_v_u16m8(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v2 = __riscv_vlse16_v_u16m8(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu_tum(b, s, v, s, vl);
@ -740,7 +740,7 @@ struct MaskedNormDiffL1_RVV<short, int> {
vl = __riscv_vsetvl_e8m4(len - i);
auto v1 = __riscv_vlse16_v_i16m8(src1 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v2 = __riscv_vlse16_v_i16m8(src2 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu_tum(b, s, v, s, vl);
@ -761,7 +761,8 @@ struct MaskedNormDiffL1_RVV<int, double> {
vl = __riscv_vsetvl_e32m4(len - i);
auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl); // 5.x
// auto v = common::__riscv_vabs(__riscv_vsub(v1, v2, vl), vl); // 4.x
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vfadd_tumu(b, s, s, __riscv_vfwcvt_f(b, v, vl), vl);
@ -836,7 +837,7 @@ struct MaskedNormDiffL2_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m4(len - i);
auto v1 = __riscv_vle8_v_u8m4(src1 + i, vl);
auto v2 = __riscv_vle8_v_u8m4(src2 + i, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl);
@ -847,7 +848,7 @@ struct MaskedNormDiffL2_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m1(len - i);
auto v1 = __riscv_vle8_v_u8m4(src1 + i * 4, vl * 4);
auto v2 = __riscv_vle8_v_u8m4(src2 + i * 4, vl * 4);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl * 4);
auto v = common::__riscv_vabd(v1, v2, vl * 4);
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(__riscv_vreinterpret_u8m4(__riscv_vmul(__riscv_vzext_vf4(__riscv_vminu(m, 1, vl), vl), 0x01010101, vl)), 0, vl * 4);
s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl * 4), s, vl * 4);
@ -859,7 +860,7 @@ struct MaskedNormDiffL2_RVV<uchar, int> {
vl = __riscv_vsetvl_e8m4(len - i);
auto v1 = __riscv_vlse8_v_u8m4(src1 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v2 = __riscv_vlse8_v_u8m4(src2 + cn * i + cn_index, sizeof(uchar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl);
@ -880,7 +881,7 @@ struct MaskedNormDiffL2_RVV<schar, int> {
vl = __riscv_vsetvl_e8m4(len - i);
auto v1 = __riscv_vlse8_v_i8m4(src1 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v2 = __riscv_vlse8_v_i8m4(src2 + cn * i + cn_index, sizeof(schar) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m4(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
s = __riscv_vwredsumu(b, __riscv_vwmulu(b, v, v, vl), s, vl);
@ -901,7 +902,7 @@ struct MaskedNormDiffL2_RVV<ushort, double> {
vl = __riscv_vsetvl_e16m2(len - i);
auto v1 = __riscv_vlse16_v_u16m2(src1 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v2 = __riscv_vlse16_v_u16m2(src2 + cn * i + cn_index, sizeof(ushort) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
auto v_mul = __riscv_vwmulu(b, v, v, vl);
@ -923,7 +924,7 @@ struct MaskedNormDiffL2_RVV<short, double> {
vl = __riscv_vsetvl_e16m2(len - i);
auto v1 = __riscv_vlse16_v_i16m2(src1 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v2 = __riscv_vlse16_v_i16m2(src2 + cn * i + cn_index, sizeof(short) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
auto v_mul = __riscv_vwmulu(b, v, v, vl);
@ -945,7 +946,7 @@ struct MaskedNormDiffL2_RVV<int, double> {
vl = __riscv_vsetvl_e16m2(len - i);
auto v1 = __riscv_vlse32_v_i32m4(src1 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v2 = __riscv_vlse32_v_i32m4(src2 + cn * i + cn_index, sizeof(int) * cn, vl);
auto v = custom_intrin::__riscv_vabd(v1, v2, vl);
auto v = common::__riscv_vabd(v1, v2, vl);
auto m = __riscv_vle8_v_u8m1(mask + i, vl);
auto b = __riscv_vmsne(m, 0, vl);
auto v_mul = __riscv_vwmulu(b, v, v, vl);
@ -1079,9 +1080,8 @@ CV_HAL_RVV_DEF_NORM_DIFF_ALL(64f, double, double, double, double)
}
using NormDiffFunc = int (*)(const uchar*, const uchar*, const uchar*, uchar*, int, int);
inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask,
size_t mask_step, int width, int height, int type, int norm_type, double* result)
{
int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size_t src2_step, const uchar* mask, size_t mask_step,
int width, int height, int type, int norm_type, double* result) {
int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
bool relative = norm_type & NORM_RELATIVE;
@ -1115,19 +1115,9 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size
},
};
static const size_t elem_size_tab[CV_DEPTH_MAX] = {
sizeof(uchar), sizeof(schar),
sizeof(ushort), sizeof(short),
sizeof(int), sizeof(float),
sizeof(double), sizeof(cv::hfloat),
sizeof(cv::bfloat), sizeof(bool),
sizeof(uint64_t), sizeof(int64_t),
sizeof(unsigned), 0,
};
CV_Assert(elem_size_tab[depth]);
bool src_continuous = (src1_step == width * elem_size_tab[depth] * cn || (src1_step != width * elem_size_tab[depth] * cn && height == 1));
src_continuous &= (src2_step == width * elem_size_tab[depth] * cn || (src2_step != width * elem_size_tab[depth] * cn && height == 1));
size_t elem_size1 = static_cast<size_t>(CV_ELEM_SIZE1(type));
bool src_continuous = (src1_step == width * elem_size1 * cn || (src1_step != width * elem_size1 * cn && height == 1));
src_continuous &= (src2_step == width * elem_size1 * cn || (src2_step != width * elem_size1 * cn && height == 1));
bool mask_continuous = (mask_step == static_cast<size_t>(width));
size_t nplanes = 1;
size_t size = width * height;
@ -1150,7 +1140,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size
res.d = 0;
if ((norm_type == NORM_L1 && depth <= CV_16S) ||
((norm_type == NORM_L2 || norm_type == NORM_L2SQR) && depth <= CV_8S)) {
const size_t esz = elem_size_tab[depth] * cn;
const size_t esz = elem_size1 * cn;
const int total = (int)size;
const int intSumBlockSize = (norm_type == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
const int blockSize = std::min(total, intSumBlockSize);
@ -1210,7 +1200,7 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size
if(relative)
{
double result_;
int ret = cv::cv_hal_rvv::norm::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_);
int ret = cv::rvv_hal::core::norm(src2, src2_step, mask, mask_step, width, height, type, norm_type, &result_);
if(ret == CV_HAL_ERROR_OK)
{
*result /= result_ + DBL_EPSILON;
@ -1220,6 +1210,6 @@ inline int normDiff(const uchar* src1, size_t src1_step, const uchar* src2, size
return CV_HAL_ERROR_OK;
}
}}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::core

View File

@ -4,18 +4,11 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED
#define OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include <opencv2/core/base.hpp>
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_normHamming8u
#define cv_hal_normHamming8u cv::cv_hal_rvv::normHamming8u
#undef cv_hal_normHammingDiff8u
#define cv_hal_normHammingDiff8u cv::cv_hal_rvv::normHammingDiff8u
#if CV_HAL_RVV_1P0_ENABLED
template <typename CellType>
inline void normHammingCnt_m8(vuint8m8_t v, vbool1_t mask, size_t len_bool, size_t& result)
@ -153,7 +146,7 @@ inline void normHammingDiff8uLoop(const uchar* a, const uchar* b, size_t n, size
}
}
inline int normHamming8u(const uchar* a, int n, int cellSize, int* result)
int normHamming8u(const uchar* a, int n, int cellSize, int* result)
{
size_t _result = 0;
@ -168,7 +161,7 @@ inline int normHamming8u(const uchar* a, int n, int cellSize, int* result)
return CV_HAL_ERROR_OK;
}
inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result)
int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize, int* result)
{
size_t _result = 0;
@ -183,6 +176,6 @@ inline int normHammingDiff8u(const uchar* a, const uchar* b, int n, int cellSize
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // CV_HAL_RVV_1P0_ENABLED
#endif //OPENCV_HAL_RVV_NORM_HAMMING_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -1,16 +1,16 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED
#define OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace rvv_hal { namespace core {
namespace cv { namespace cv_hal_rvv { namespace detail {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
static constexpr size_t sincos_mask = 0x3;
@ -67,6 +67,44 @@ static inline void
cosval = __riscv_vfneg_mu(__riscv_vmor(idx1, idx2, vl), cosval, cosval, vl);
}
}}} // namespace cv::cv_hal_rvv::detail
template <typename RVV_T, typename Elem = typename RVV_T::ElemType>
inline int polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees)
{
using T = RVV_F32M4;
const auto sincos_scale = angleInDegrees ? sincos_deg_scale : sincos_rad_scale;
#endif // OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED
size_t vl;
auto cos_p2 = T::vmv(sincos_cos_p2, T::setvlmax());
auto cos_p0 = T::vmv(sincos_cos_p0, T::setvlmax());
for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl)
{
vl = RVV_T::setvl(len);
auto vangle = T::cast(RVV_T::vload(angle, vl), vl);
T::VecType vsin, vcos;
SinCos32f<T>(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl);
if (mag)
{
auto vmag = T::cast(RVV_T::vload(mag, vl), vl);
vsin = __riscv_vfmul(vsin, vmag, vl);
vcos = __riscv_vfmul(vcos, vmag, vl);
mag += vl;
}
RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl);
RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl);
}
return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL entry: single-precision polar-to-Cartesian conversion.
// Dispatches to the generic polarToCart kernel using f32 M4 register groups.
int polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) {
    return polarToCart<RVV_F32M4>(mag, angle, x, y, len, angleInDegrees);
}
// HAL entry: double-precision polar-to-Cartesian conversion.
// Dispatches to the same kernel with f64 M8 register groups; the kernel's
// sin/cos core computes in f32 and casts to/from the element type.
int polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) {
    return polarToCart<RVV_F64M8>(mag, angle, x, y, len, angleInDegrees);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,22 +4,17 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_QR_HPP_INCLUDED
#define OPENCV_HAL_RVV_QR_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <cfloat>
#include <cmath>
#include <typeinfo>
#include <vector>
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv { namespace qr {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_QR32f
#define cv_hal_QR32f cv::cv_hal_rvv::qr::QR<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_QR64f
#define cv_hal_QR64f cv::cv_hal_rvv::qr::QR<cv::cv_hal_rvv::RVV_F64M4>
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from core/src/matrix_decomp.cpp,
// in the function template static int cv::QRImpl
@ -171,6 +166,15 @@ inline int QR(T* src1, size_t src1_step, int m, int n, int k, T* src2, size_t sr
return CV_HAL_ERROR_OK;
}
}}}
} // anonymous
#endif
// HAL entry: single-precision QR decomposition (in-place on src1), forwarding
// to the shared QR implementation instantiated for f32 M4 register groups.
int QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) {
    return QR<RVV_F32M4>(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
// HAL entry: double-precision QR decomposition, same kernel for f64 M4.
int QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) {
    return QR<RVV_F64M4>(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -1,17 +1,14 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED
#define OPENCV_HAL_RVV_SPLIT_HPP_INCLUDED
#include <riscv_vector.h>
#include "rvv_hal.hpp"
namespace cv { namespace cv_hal_rvv {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_split8u
#define cv_hal_split8u cv::cv_hal_rvv::split8u
#if CV_HAL_RVV_1P0_ENABLED
inline int split8u(const uchar* src, uchar** dst, int len, int cn)
int split8u(const uchar* src, uchar** dst, int len, int cn)
{
int vl = 0;
if (cn == 1)
@ -89,5 +86,6 @@ inline int split8u(const uchar* src, uchar** dst, int len, int cn)
return CV_HAL_ERROR_OK;
}
}}
#endif
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -0,0 +1,74 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int sqrt(const Elem* src, Elem* dst, int _len)
{
size_t vl;
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
{
vl = SQRT_T::T::setvl(len);
auto x = SQRT_T::T::vload(src, vl);
SQRT_T::T::vstore(dst, common::sqrt<SQRT_T::iter_times>(x, vl), vl);
}
return CV_HAL_ERROR_OK;
}
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int invSqrt(const Elem* src, Elem* dst, int _len)
{
size_t vl;
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
{
vl = SQRT_T::T::setvl(len);
auto x = SQRT_T::T::vload(src, vl);
SQRT_T::T::vstore(dst, common::invSqrt<SQRT_T::iter_times>(x, vl), vl);
}
return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL entry: element-wise sqrt over a contiguous f32 array (M8 register groups).
int sqrt32f(const float* src, float* dst, int len) {
    return sqrt<common::Sqrt32f<RVV_F32M8>>(src, dst, len);
}
// HAL entry: element-wise sqrt over a contiguous f64 array (M8 register groups).
int sqrt64f(const double* src, double* dst, int len) {
    return sqrt<common::Sqrt64f<RVV_F64M8>>(src, dst, len);
}
// HAL entry: element-wise 1/sqrt over a contiguous f32 array.
int invSqrt32f(const float* src, float* dst, int len) {
#ifdef __clang__
    // Workaround for a clang codegen issue: invSqrt uses 2 LMUL register
    // groups to hold the mask, which leads to bad memory accesses, so a
    // smaller LMUL (M4 instead of M8) is used here.
    return invSqrt<common::Sqrt32f<RVV_F32M4>>(src, dst, len);
#else
    return invSqrt<common::Sqrt32f<RVV_F32M8>>(src, dst, len);
#endif
}
// HAL entry: element-wise 1/sqrt over a contiguous f64 array.
int invSqrt64f(const double* src, double* dst, int len) {
#ifdef __clang__
    // Same clang workaround as invSqrt32f: drop to M4 to keep the mask in
    // a single register group.
    return invSqrt<common::Sqrt64f<RVV_F64M4>>(src, dst, len);
#else
    return invSqrt<common::Sqrt64f<RVV_F64M8>>(src, dst, len);
#endif
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -4,22 +4,17 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_SVD_HPP_INCLUDED
#define OPENCV_HAL_RVV_SVD_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <cfloat>
#include <cmath>
#include <typeinfo>
#include <vector>
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv { namespace svd {
namespace cv { namespace rvv_hal { namespace core {
#undef cv_hal_SVD32f
#define cv_hal_SVD32f cv::cv_hal_rvv::svd::SVD<cv::cv_hal_rvv::RVV_F32M4>
#undef cv_hal_SVD64f
#define cv_hal_SVD64f cv::cv_hal_rvv::svd::SVD<cv::cv_hal_rvv::RVV_F64M4>
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from core/src/lapack.cpp,
// in the function template static void cv::JacobiSVDImpl_
@ -268,6 +263,15 @@ inline int SVD(T* src, size_t src_step, T* w, T*, size_t, T* vt, size_t vt_step,
return CV_HAL_ERROR_OK;
}
}}}
} // anonymous
#endif
// HAL entry: single-precision Jacobi SVD, forwarding to the shared SVD
// implementation instantiated for f32 M4 register groups.
int SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) {
    return SVD<RVV_F32M4>(src, src_step, w, u, u_step, vt, vt_step, m, n, flags);
}
// HAL entry: double-precision Jacobi SVD, same kernel for f64 M4.
int SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) {
    return SVD<RVV_F64M4>(src, src_step, w, u, u_step, vt, vt_step, m, n, flags);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::core

View File

@ -5,12 +5,7 @@
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED
#define OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED
#include <riscv_vector.h>
namespace cv { namespace cv_hal_rvv { namespace transpose {
#include "rvv_hal.hpp"
#if defined (__clang__) && __clang_major__ < 18
#define OPENCV_HAL_IMPL_RVV_VCREATE_x4(suffix, width, v0, v1, v2, v3) \
@ -35,18 +30,22 @@ namespace cv { namespace cv_hal_rvv { namespace transpose {
#define __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7) OPENCV_HAL_IMPL_RVV_VCREATE_x8(i64, 1, v0, v1, v2, v3, v4, v5, v6, v7)
#endif
namespace cv { namespace rvv_hal { namespace core {
#if CV_HAL_RVV_1P0_ENABLED
static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_8u_8xVl = [](const uchar *src, size_t src_step, uchar *dst, size_t dst_step, const int vl) {
auto transpose_8u_8xVl = [](const uchar *src, size_t sstep, uchar *dst, size_t dstep, const int vl) {
auto v0 = __riscv_vle8_v_u8m1(src, vl);
auto v1 = __riscv_vle8_v_u8m1(src + src_step, vl);
auto v2 = __riscv_vle8_v_u8m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle8_v_u8m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle8_v_u8m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle8_v_u8m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle8_v_u8m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle8_v_u8m1(src + 7 * src_step, vl);
auto v1 = __riscv_vle8_v_u8m1(src + sstep, vl);
auto v2 = __riscv_vle8_v_u8m1(src + 2 * sstep, vl);
auto v3 = __riscv_vle8_v_u8m1(src + 3 * sstep, vl);
auto v4 = __riscv_vle8_v_u8m1(src + 4 * sstep, vl);
auto v5 = __riscv_vle8_v_u8m1(src + 5 * sstep, vl);
auto v6 = __riscv_vle8_v_u8m1(src + 6 * sstep, vl);
auto v7 = __riscv_vle8_v_u8m1(src + 7 * sstep, vl);
vuint8m1x8_t v = __riscv_vcreate_v_u8m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e8(dst, dst_step, v, vl);
__riscv_vssseg8e8(dst, dstep, v, vl);
};
int h = 0, w = 0;
@ -72,17 +71,17 @@ static void transpose2d_8u(const uchar *src_data, size_t src_step, uchar *dst_da
}
static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_16u_8xVl = [](const ushort *src, size_t src_step, ushort *dst, size_t dst_step, const int vl) {
auto transpose_16u_8xVl = [](const ushort *src, size_t sstep, ushort *dst, size_t dstep, const int vl) {
auto v0 = __riscv_vle16_v_u16m1(src, vl);
auto v1 = __riscv_vle16_v_u16m1(src + src_step, vl);
auto v2 = __riscv_vle16_v_u16m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle16_v_u16m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle16_v_u16m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle16_v_u16m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle16_v_u16m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle16_v_u16m1(src + 7 * src_step, vl);
auto v1 = __riscv_vle16_v_u16m1(src + sstep, vl);
auto v2 = __riscv_vle16_v_u16m1(src + 2 * sstep, vl);
auto v3 = __riscv_vle16_v_u16m1(src + 3 * sstep, vl);
auto v4 = __riscv_vle16_v_u16m1(src + 4 * sstep, vl);
auto v5 = __riscv_vle16_v_u16m1(src + 5 * sstep, vl);
auto v6 = __riscv_vle16_v_u16m1(src + 6 * sstep, vl);
auto v7 = __riscv_vle16_v_u16m1(src + 7 * sstep, vl);
vuint16m1x8_t v = __riscv_vcreate_v_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e16(dst, dst_step, v, vl);
__riscv_vssseg8e16(dst, dstep, v, vl);
};
size_t src_step_base = src_step / sizeof(ushort);
@ -111,13 +110,13 @@ static void transpose2d_16u(const uchar *src_data, size_t src_step, uchar *dst_d
}
static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_32s_4xVl = [](const int *src, size_t src_step, int *dst, size_t dst_step, const int vl) {
auto transpose_32s_4xVl = [](const int *src, size_t sstep, int *dst, size_t dstep, const int vl) {
auto v0 = __riscv_vle32_v_i32m1(src, vl);
auto v1 = __riscv_vle32_v_i32m1(src + src_step, vl);
auto v2 = __riscv_vle32_v_i32m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle32_v_i32m1(src + 3 * src_step, vl);
auto v1 = __riscv_vle32_v_i32m1(src + sstep, vl);
auto v2 = __riscv_vle32_v_i32m1(src + 2 * sstep, vl);
auto v3 = __riscv_vle32_v_i32m1(src + 3 * sstep, vl);
vint32m1x4_t v = __riscv_vcreate_v_i32m1x4(v0, v1, v2, v3);
__riscv_vssseg4e32(dst, dst_step, v, vl);
__riscv_vssseg4e32(dst, dstep, v, vl);
};
size_t src_step_base = src_step / sizeof(int);
@ -146,17 +145,17 @@ static void transpose2d_32s(const uchar *src_data, size_t src_step, uchar *dst_d
}
static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst_data, size_t dst_step, int src_width, int src_height) {
auto transpose_64s_8xVl = [](const int64_t *src, size_t src_step, int64_t *dst, size_t dst_step, const int vl) {
auto transpose_64s_8xVl = [](const int64_t *src, size_t sstep, int64_t *dst, size_t dstep, const int vl) {
auto v0 = __riscv_vle64_v_i64m1(src, vl);
auto v1 = __riscv_vle64_v_i64m1(src + src_step, vl);
auto v2 = __riscv_vle64_v_i64m1(src + 2 * src_step, vl);
auto v3 = __riscv_vle64_v_i64m1(src + 3 * src_step, vl);
auto v4 = __riscv_vle64_v_i64m1(src + 4 * src_step, vl);
auto v5 = __riscv_vle64_v_i64m1(src + 5 * src_step, vl);
auto v6 = __riscv_vle64_v_i64m1(src + 6 * src_step, vl);
auto v7 = __riscv_vle64_v_i64m1(src + 7 * src_step, vl);
auto v1 = __riscv_vle64_v_i64m1(src + sstep, vl);
auto v2 = __riscv_vle64_v_i64m1(src + 2 * sstep, vl);
auto v3 = __riscv_vle64_v_i64m1(src + 3 * sstep, vl);
auto v4 = __riscv_vle64_v_i64m1(src + 4 * sstep, vl);
auto v5 = __riscv_vle64_v_i64m1(src + 5 * sstep, vl);
auto v6 = __riscv_vle64_v_i64m1(src + 6 * sstep, vl);
auto v7 = __riscv_vle64_v_i64m1(src + 7 * sstep, vl);
vint64m1x8_t v = __riscv_vcreate_v_i64m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
__riscv_vssseg8e64(dst, dst_step, v, vl);
__riscv_vssseg8e64(dst, dstep, v, vl);
};
size_t src_step_base = src_step / sizeof(int64_t);
@ -184,11 +183,8 @@ static void transpose2d_32sC2(const uchar *src_data, size_t src_step, uchar *dst
}
}
#undef cv_hal_transpose2d
#define cv_hal_transpose2d cv::cv_hal_rvv::transpose::transpose2d
using Transpose2dFunc = void (*)(const uchar*, size_t, uchar*, size_t, int, int);
inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
int src_width, int src_height, int element_size) {
if (src_data == dst_data) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -205,7 +201,7 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data,
0, 0, 0, 0,
0
};
Transpose2dFunc func = tab[element_size];
Transpose2dFunc func = element_size <= 32 ? tab[element_size] : nullptr;
if (!func) {
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
@ -215,6 +211,6 @@ inline int transpose2d(const uchar* src_data, size_t src_step, uchar* dst_data,
return CV_HAL_ERROR_OK;
}
}}} // cv::cv_hal_rvv::transpose
#endif // CV_HAL_RVV_1P0_ENABLED
#endif // OPENCV_HAL_RVV_TRANSPOSE_HPP_INCLUDED
}}} // cv::rvv_hal::core

View File

@ -0,0 +1,361 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp
// in the functor BilateralFilter_8u_Invoker
// Bilateral filter kernel for single-channel 8-bit rows [start, end).
// src_data points at the border-padded temp image built by bilateralFilter;
// sptr is offset by 'radius' in both directions so column index j addresses
// the unpadded pixel. For each pixel p it accumulates, over the maxk kernel
// taps q = p + space_ofs[k]:
//   w       = space_weight[k] * color_weight[|I(p) - I(q)|]
//   wsum   += w;  sum += w * I(q)
// and finally writes dst = sum / wsum converted back to u8.
static inline int bilateralFilter8UC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight)
{
    // Round the accumulator buffers up to a 32-byte boundary (align = 31 mask).
    constexpr int align = 31;
    std::vector<float> _sum(width + align), _wsum(width + align);
    float* sum = reinterpret_cast<float*>(((size_t)_sum.data() + align) & ~align);
    float* wsum = reinterpret_cast<float*>(((size_t)_wsum.data() + align) & ~align);
    for (int i = start; i < end; i++)
    {
        const uchar* sptr = src_data + (i+radius) * src_step + radius;
        // Reset the per-row accumulators.
        memset(sum, 0, sizeof(float) * width);
        memset(wsum, 0, sizeof(float) * width);
        for(int k = 0; k < maxk; k++)
        {
            const uchar* ksptr = sptr + space_ofs[k];
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e8m2(width - j);
                auto src = __riscv_vle8_v_u8m2(sptr + j, vl);
                auto ksrc = __riscv_vle8_v_u8m2(ksptr + j, vl);
                // |src - ksrc| computed as max - min to stay in unsigned arithmetic.
                auto diff = __riscv_vsub(__riscv_vmaxu(src, ksrc, vl), __riscv_vminu(src, ksrc, vl), vl);
                // Gather color_weight[diff]; indices are scaled to byte offsets.
                auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vzext_vf2(diff, vl), sizeof(float), vl), vl);
                w = __riscv_vfmul(w, space_weight[k], vl);
                // wsum += w;  sum += w * ksrc (widened u8 -> u16 -> f32).
                __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl);
                __riscv_vse32(sum + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc, vl), vl), __riscv_vle32_v_f32m8(sum + j, vl), vl), vl);
            }
        }
        int vl;
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e8m2(width - j);
            // dst = sum / wsum, narrowed f32 -> u16 -> u8.
            auto dst = __riscv_vfncvt_xu(__riscv_vfdiv(__riscv_vle32_v_f32m8(sum + j, vl), __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl);
            __riscv_vse8(dst_data + i * dst_step + j, __riscv_vncvt_x(dst, vl), vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// Bilateral filter kernel for 3-channel 8-bit rows [start, end).
// Same accumulation scheme as bilateralFilter8UC1, but channels are
// de-interleaved with segment loads and the color distance is the sum of the
// per-channel absolute differences (L1), used to index color_weight.
static inline int bilateralFilter8UC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* color_weight)
{
    // 32-byte-aligned per-channel accumulators plus the shared weight sum.
    constexpr int align = 31;
    std::vector<float> _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align);
    float* sum_b = reinterpret_cast<float*>(((size_t)_sum_b.data() + align) & ~align);
    float* sum_g = reinterpret_cast<float*>(((size_t)_sum_g.data() + align) & ~align);
    float* sum_r = reinterpret_cast<float*>(((size_t)_sum_r.data() + align) & ~align);
    float* wsum = reinterpret_cast<float*>(((size_t)_wsum.data() + align) & ~align);
    for (int i = start; i < end; i++)
    {
        const uchar* sptr = src_data + (i+radius) * src_step + radius*3;
        memset(sum_b, 0, sizeof(float) * width);
        memset(sum_g, 0, sizeof(float) * width);
        memset(sum_r, 0, sizeof(float) * width);
        memset(wsum, 0, sizeof(float) * width);
        for(int k = 0; k < maxk; k++)
        {
            const uchar* ksptr = sptr + space_ofs[k];
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e8m2(width - j);
                // De-interleave center pixel channels (b, g, r).
                auto src = __riscv_vlseg3e8_v_u8m2x3(sptr + j * 3, vl);
                auto src0 = __riscv_vget_v_u8m2x3_u8m2(src, 0);
                auto src1 = __riscv_vget_v_u8m2x3_u8m2(src, 1);
                auto src2 = __riscv_vget_v_u8m2x3_u8m2(src, 2);
                // De-interleave the neighbor at this kernel tap.
                src = __riscv_vlseg3e8_v_u8m2x3(ksptr + j * 3, vl);
                auto ksrc0 = __riscv_vget_v_u8m2x3_u8m2(src, 0);
                auto ksrc1 = __riscv_vget_v_u8m2x3_u8m2(src, 1);
                auto ksrc2 = __riscv_vget_v_u8m2x3_u8m2(src, 2);
                // Per-channel |a - b| via max - min (unsigned-safe).
                auto diff0 = __riscv_vsub(__riscv_vmaxu(src0, ksrc0, vl), __riscv_vminu(src0, ksrc0, vl), vl);
                auto diff1 = __riscv_vsub(__riscv_vmaxu(src1, ksrc1, vl), __riscv_vminu(src1, ksrc1, vl), vl);
                auto diff2 = __riscv_vsub(__riscv_vmaxu(src2, ksrc2, vl), __riscv_vminu(src2, ksrc2, vl), vl);
                // Gather color_weight[diff0 + diff1 + diff2] (byte-offset indices).
                auto w = __riscv_vloxei16_v_f32m8(color_weight, __riscv_vmul(__riscv_vadd(__riscv_vadd(__riscv_vzext_vf2(diff0, vl), __riscv_vzext_vf2(diff1, vl), vl), __riscv_vzext_vf2(diff2, vl), vl), sizeof(float), vl), vl);
                w = __riscv_vfmul(w, space_weight[k], vl);
                // Accumulate the weight and the weighted neighbor per channel.
                __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m8(wsum + j, vl), vl), vl);
                __riscv_vse32(sum_b + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc0, vl), vl), __riscv_vle32_v_f32m8(sum_b + j, vl), vl), vl);
                __riscv_vse32(sum_g + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc1, vl), vl), __riscv_vle32_v_f32m8(sum_g + j, vl), vl), vl);
                __riscv_vse32(sum_r + j, __riscv_vfmadd(w, __riscv_vfwcvt_f(__riscv_vzext_vf2(ksrc2, vl), vl), __riscv_vle32_v_f32m8(sum_r + j, vl), vl), vl);
            }
        }
        int vl;
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e8m2(width - j);
            // Compute 1/wsum once and scale all three channel sums by it.
            auto w = __riscv_vfrdiv(__riscv_vle32_v_f32m8(wsum + j, vl), 1.0f, vl);
            vuint8m2x3_t dst{};
            dst = __riscv_vset_v_u8m2_u8m2x3(dst, 0,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_b + j, vl), w, vl), vl), vl));
            dst = __riscv_vset_v_u8m2_u8m2x3(dst, 1,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_g + j, vl), w, vl), vl), vl));
            dst = __riscv_vset_v_u8m2_u8m2x3(dst, 2,__riscv_vncvt_x(__riscv_vfncvt_xu(__riscv_vfmul(__riscv_vle32_v_f32m8(sum_r + j, vl), w, vl), vl), vl));
            // Re-interleave and store the 3-channel result.
            __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// the algorithm is copied from imgproc/src/bilateral_filter.simd.cpp
// in the functor BilateralFilter_32f_Invoker
// Bilateral filter kernel for single-channel 32-bit float rows [start, end).
// The color weight comes from the precomputed expLUT with linear
// interpolation: for scaled distance d, idx = trunc(d) and alpha = frac(d),
// w = expLUT[idx] + alpha * (expLUT[idx+1] - expLUT[idx]).
// The center pixel is excluded from space_ofs for float inputs (see
// bilateralFilter) and is folded into the final division with unit weight:
// dst = (sum + src) / (wsum + 1).
static inline int bilateralFilter32FC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index)
{
    // 32-byte-aligned accumulator rows.
    constexpr int align = 31;
    std::vector<float> _sum(width + align), _wsum(width + align);
    float* sum = reinterpret_cast<float*>(((size_t)_sum.data() + align) & ~align);
    float* wsum = reinterpret_cast<float*>(((size_t)_wsum.data() + align) & ~align);
    for (int i = start; i < end; i++)
    {
        const float* sptr = reinterpret_cast<const float*>(src_data + (i+radius) * src_step) + radius;
        memset(sum, 0, sizeof(float) * width);
        memset(wsum, 0, sizeof(float) * width);
        for(int k = 0; k < maxk; k++)
        {
            const float* ksptr = sptr + space_ofs[k];
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e32m4(width - j);
                auto src = __riscv_vle32_v_f32m4(sptr + j, vl);
                auto ksrc = __riscv_vle32_v_f32m4(ksptr + j, vl);
                // Scaled color distance, split into integer LUT index + fraction.
                auto diff = __riscv_vfmul(__riscv_vfabs(__riscv_vfsub(src, ksrc, vl), vl), scale_index, vl);
                auto idx = __riscv_vfcvt_rtz_x(diff, vl);
                auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl);
                // Segment-gather the adjacent LUT pair expLUT[idx], expLUT[idx+1].
                auto exp = __riscv_vloxseg2ei32_v_f32m4x2(expLUT, __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmul(idx, sizeof(float), vl)), vl);
                // Linear interpolation between the two LUT entries.
                auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m4x2_f32m4(exp, 1), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl), __riscv_vget_v_f32m4x2_f32m4(exp, 0), vl);
                w = __riscv_vfmul(w, space_weight[k], vl);
                // wsum += w;  sum += w * ksrc.
                __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m4(wsum + j, vl), vl), vl);
                __riscv_vse32(sum + j, __riscv_vfmadd(w, ksrc, __riscv_vle32_v_f32m4(sum + j, vl), vl), vl);
            }
        }
        int vl;
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e32m4(width - j);
            auto src = __riscv_vle32_v_f32m4(sptr + j, vl);
            // Fold the center pixel in with weight 1, then normalize.
            auto dst = __riscv_vfdiv(__riscv_vfadd(__riscv_vle32_v_f32m4(sum + j, vl), src, vl), __riscv_vfadd(__riscv_vle32_v_f32m4(wsum + j, vl), 1, vl), vl);
            __riscv_vse32(reinterpret_cast<float*>(dst_data + i * dst_step) + j, dst, vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
// Bilateral filter kernel for 3-channel 32-bit float rows [start, end).
// Same LUT-interpolated color weight as bilateralFilter32FC1, with the color
// distance taken as the sum of per-channel absolute differences (L1). The
// center pixel is folded into the final normalization with unit weight.
static inline int bilateralFilter32FC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int radius, int maxk, const int* space_ofs, const float* space_weight, const float* expLUT, float scale_index)
{
    // 32-byte-aligned per-channel accumulators plus the shared weight sum.
    constexpr int align = 31;
    std::vector<float> _sum_b(width + align), _sum_g(width + align), _sum_r(width + align), _wsum(width + align);
    float* sum_b = reinterpret_cast<float*>(((size_t)_sum_b.data() + align) & ~align);
    float* sum_g = reinterpret_cast<float*>(((size_t)_sum_g.data() + align) & ~align);
    float* sum_r = reinterpret_cast<float*>(((size_t)_sum_r.data() + align) & ~align);
    float* wsum = reinterpret_cast<float*>(((size_t)_wsum.data() + align) & ~align);
    for (int i = start; i < end; i++)
    {
        const float* sptr = reinterpret_cast<const float*>(src_data + (i+radius) * src_step) + radius*3;
        memset(sum_b, 0, sizeof(float) * width);
        memset(sum_g, 0, sizeof(float) * width);
        memset(sum_r, 0, sizeof(float) * width);
        memset(wsum, 0, sizeof(float) * width);
        for(int k = 0; k < maxk; k++)
        {
            const float* ksptr = sptr + space_ofs[k];
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e32m2(width - j);
                // De-interleave center and neighbor pixels into b/g/r lanes.
                auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl);
                auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0);
                auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1);
                auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2);
                src = __riscv_vlseg3e32_v_f32m2x3(ksptr + j * 3, vl);
                auto ksrc0 = __riscv_vget_v_f32m2x3_f32m2(src, 0);
                auto ksrc1 = __riscv_vget_v_f32m2x3_f32m2(src, 1);
                auto ksrc2 = __riscv_vget_v_f32m2x3_f32m2(src, 2);
                // Scaled L1 color distance; split into LUT index + fraction.
                auto diff = __riscv_vfmul(__riscv_vfadd(__riscv_vfadd(__riscv_vfabs(__riscv_vfsub(src0, ksrc0, vl), vl), __riscv_vfabs(__riscv_vfsub(src1, ksrc1, vl), vl), vl), __riscv_vfabs(__riscv_vfsub(src2, ksrc2, vl), vl), vl), scale_index, vl);
                auto idx = __riscv_vfcvt_rtz_x(diff, vl);
                auto alpha = __riscv_vfsub(diff, __riscv_vfcvt_f(idx, vl), vl);
                // Segment-gather expLUT[idx], expLUT[idx+1] and interpolate.
                auto exp = __riscv_vloxseg2ei32_v_f32m2x2(expLUT, __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmul(idx, sizeof(float), vl)), vl);
                auto w = __riscv_vfmadd(alpha, __riscv_vfsub(__riscv_vget_v_f32m2x2_f32m2(exp, 1), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl), __riscv_vget_v_f32m2x2_f32m2(exp, 0), vl);
                w = __riscv_vfmul(w, space_weight[k], vl);
                // Accumulate the weight and the weighted neighbor per channel.
                __riscv_vse32(wsum + j, __riscv_vfadd(w, __riscv_vle32_v_f32m2(wsum + j, vl), vl), vl);
                __riscv_vse32(sum_b + j, __riscv_vfmadd(w, ksrc0, __riscv_vle32_v_f32m2(sum_b + j, vl), vl), vl);
                __riscv_vse32(sum_g + j, __riscv_vfmadd(w, ksrc1, __riscv_vle32_v_f32m2(sum_g + j, vl), vl), vl);
                __riscv_vse32(sum_r + j, __riscv_vfmadd(w, ksrc2, __riscv_vle32_v_f32m2(sum_r + j, vl), vl), vl);
            }
        }
        int vl;
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e32m2(width - j);
            // Reciprocal of (wsum + 1); the +1 accounts for the center pixel.
            auto w = __riscv_vfrdiv(__riscv_vfadd(__riscv_vle32_v_f32m2(wsum + j, vl), 1, vl), 1, vl);
            auto src = __riscv_vlseg3e32_v_f32m2x3(sptr + j * 3, vl);
            auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0);
            auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1);
            auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2);
            vfloat32m2x3_t dst{};
            // (sum + center) * 1/(wsum + 1) per channel, then re-interleave.
            dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_b + j, vl), src0, vl), vl));
            dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_g + j, vl), src1, vl), vl));
            dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, __riscv_vfmul(w, __riscv_vfadd(__riscv_vle32_v_f32m2(sum_r + j, vl), src2, vl), vl));
            __riscv_vsseg3e32(reinterpret_cast<float*>(dst_data + i * dst_step) + j * 3, dst, vl);
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// the algorithm is copied from imgproc/src/bilateral_filter.dispatch.cpp
// in the function static void bilateralFilter_8u and bilateralFilter_32f
// HAL entry point for cv_hal_bilateralFilter. Supports CV_8UC1/CV_8UC3 and
// CV_32FC1/CV_32FC3, out-of-place only, and non-isolated borders. It builds a
// border-padded copy of the source, precomputes the spatial Gaussian weights
// and kernel offsets, precomputes the color-weight table (8U: direct table;
// 32F: exp LUT indexed by scaled intensity distance), then dispatches rows to
// the per-type kernels above via common::invoke.
int bilateralFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step,
                    int width, int height, int depth, int cn, int d, double sigma_color, double sigma_space, int border_type)
{
    const int type = CV_MAKETYPE(depth, cn);
    if (type != CV_8UC1 && type != CV_8UC3 && type != CV_32FC1 && type != CV_32FC3)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    // Large single-channel float images are left to the generic implementation.
    if (type == CV_32FC1 && width * height > 1 << 20)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (src_data == dst_data || border_type & BORDER_ISOLATED)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // Sanitize sigmas and derive the kernel radius/diameter from them when
    // the caller passed d <= 0.
    sigma_color = sigma_color <= 0 ? 1 : sigma_color;
    sigma_space = sigma_space <= 0 ? 1 : sigma_space;
    double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
    double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
    int radius = d <= 0 ? std::round(sigma_space*1.5) : d/2;
    radius = std::max(radius, 1);
    d = radius*2 + 1;

    // Build a zero-initialized copy of the source padded by 'radius' on every
    // side, filling the borders via common::borderInterpolate (-1 means the
    // border pixel is undefined for this border type and stays zero).
    const int size = depth == CV_32F ? cn * sizeof(float) : cn;
    const int temp_step = (width + radius * 2) * size;
    std::vector<uchar> _temp((width + radius * 2) * (height + radius * 2) * size, 0);
    uchar* temp = _temp.data();
    std::vector<int> width_interpolate(radius * 2);
    for (int j = 0; j < radius; j++)
    {
        width_interpolate[j] = common::borderInterpolate(j - radius, width, border_type);
        width_interpolate[j + radius] = common::borderInterpolate(width + j, width, border_type);
    }
    for (int i = 0; i < height + radius * 2; i++)
    {
        int x = common::borderInterpolate(i - radius, height, border_type);
        if (x != -1)
        {
            for (int j = 0; j < radius; j++)
            {
                int y = width_interpolate[j];
                if (y != -1)
                    memcpy(temp + i * temp_step + j * size, src_data + x * src_step + y * size, size);
                y = width_interpolate[j + radius];
                if (y != -1)
                    memcpy(temp + i * temp_step + (width + j + radius) * size, src_data + x * src_step + y * size, size);
            }
            memcpy(temp + i * temp_step + radius * size, src_data + x * src_step, width * size);
        }
    }

    // Enumerate kernel taps inside the circular support (r <= radius).
    // For float inputs the center tap (i == 0 && j == 0) is excluded here and
    // re-added with unit weight inside the 32F kernels.
    std::vector<float> _space_weight(d*d);
    std::vector<int> _space_ofs(d*d);
    float* space_weight = _space_weight.data();
    int* space_ofs = _space_ofs.data();
    int maxk = 0;
    for (int i = -radius; i <= radius; i++)
    {
        for (int j = -radius; j <= radius; j++)
        {
            double r = std::sqrt((double)i*i + (double)j*j);
            if (r <= radius && (depth == CV_8U || i != 0 || j != 0))
            {
                space_weight[maxk] = static_cast<float>(r*r*gauss_space_coeff);
                space_ofs[maxk++] = (i * (temp_step / size) + j) * cn;
            }
        }
    }
    // Turn the stored exponents into actual Gaussian weights in place.
    cv::rvv_hal::core::exp32f(space_weight, space_weight, maxk);

    if (depth == CV_8U)
    {
        // 8U path: color weights fit in a direct table over all possible
        // L1 distances (256 * cn entries).
        std::vector<float> _color_weight(cn*256);
        float* color_weight = _color_weight.data();
        for (int i = 0; i < 256*cn; i++)
            color_weight[i] = static_cast<float>(i*i*gauss_color_coeff);
        cv::rvv_hal::core::exp32f(color_weight, color_weight, 256*cn);
        switch (cn)
        {
        case 1:
            return common::invoke(height, {bilateralFilter8UC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight);
        case 3:
            return common::invoke(height, {bilateralFilter8UC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, color_weight);
        }
    }
    else
    {
        // 32F path: size the exp LUT from the actual source value range.
        double minValSrc = -1, maxValSrc = 1;
        cv::rvv_hal::core::minMaxIdx(src_data, src_step, width * cn, height, CV_32F, &minValSrc, &maxValSrc, nullptr, nullptr, nullptr);
        if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
        {
            // Constant image: filtering is the identity, just copy it over.
            // NOTE(review): this loop iterates i over 'width' but steps by row
            // strides and copies 'width * size' bytes per row — presumably it
            // should iterate over 'height'; confirm against the generic
            // bilateralFilter dispatch code.
            for (int i = 0; i < width; i++)
                memcpy(dst_data + i * dst_step, src_data + i * src_step, width * size);
            return CV_HAL_ERROR_OK;
        }
        const int kExpNumBinsPerChannel = 1 << 12;
        const int kExpNumBins = kExpNumBinsPerChannel * cn;
        const float scale_index = kExpNumBins / static_cast<float>((maxValSrc - minValSrc) * cn);
        // Two extra entries so the kernels can safely read expLUT[idx + 1].
        std::vector<float> _expLUT(kExpNumBins+2, 0);
        float* expLUT = _expLUT.data();
        for (int i = 0; i < kExpNumBins+2; i++)
        {
            double val = i / scale_index;
            expLUT[i] = static_cast<float>(val * val * gauss_color_coeff);
        }
        cv::rvv_hal::core::exp32f(expLUT, expLUT, kExpNumBins+2);
        switch (cn)
        {
        case 1:
            return common::invoke(height, {bilateralFilter32FC1}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index);
        case 3:
            return common::invoke(height, {bilateralFilter32FC3}, temp, temp_step, dst_data, dst_step, width, radius, maxk, space_ofs, space_weight, expLUT, scale_index);
        }
    }
    return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,392 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Per-element-type intrinsic shims used by the filter kernels in this file:
//   vcvt0: widen an element vector into the accumulator vector type
//   vcvt1: narrow the accumulator back to the element type
//   vdiv : divide the accumulator by scalar 'b'
template<typename T> struct rvv;
template<> struct rvv<uchar>
{
    // u8 accumulates in u16; vnclipu saturates when narrowing back.
    static inline vuint16m8_t vcvt0(vuint8m4_t a, size_t b) { return __riscv_vzext_vf2(a, b); }
    static inline vuint8m4_t vcvt1(vuint16m8_t a, size_t b) { return __riscv_vnclipu(a, 0, __RISCV_VXRM_RNU, b); }
    // (a + b/2) / b: unsigned division rounded to nearest.
    static inline vuint16m8_t vdiv(vuint16m8_t a, ushort b, size_t c) { return __riscv_vdivu(__riscv_vadd(a, b / 2, c), b, c); }
};
template<> struct rvv<short>
{
    // i16 accumulates in i32; vnclip saturates when narrowing back.
    static inline vint32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vsext_vf2(a, b); }
    static inline vint16m4_t vcvt1(vint32m8_t a, size_t b) { return __riscv_vnclip(a, 0, __RISCV_VXRM_RNU, b); }
    // (a + b/2) / b: signed division biased toward nearest for positive sums.
    static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); }
};
template<> struct rvv<int>
{
    // i32 accumulates in place; the widen/narrow hooks are identity.
    static inline vint32m8_t vcvt0(vint32m8_t a, size_t) { return a; }
    static inline vint32m8_t vcvt1(vint32m8_t a, size_t) { return a; }
    static inline vint32m8_t vdiv(vint32m8_t a, int b, size_t c) { return __riscv_vdiv(__riscv_vadd(a, b / 2, c), b, c); }
};
template<> struct rvv<float>
{
    // f32 needs no widening/narrowing or rounding bias; plain FP divide.
    static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; }
    static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; }
    static inline vfloat32m8_t vdiv(vfloat32m8_t a, float b, size_t c) { return __riscv_vfdiv(a, b, c); }
};
// the algorithm is same as cv_hal_sepFilter
// Single-channel box filter over rows [start, end), implemented as a separable
// two-pass sum (horizontal, then vertical), mirroring cv_hal_sepFilter:
//  - pass 1: for each needed source row, accumulate ksize horizontal neighbours
//    into a ring buffer `res` holding the last ksize rows of partial sums;
//  - pass 2: once ksize rows are buffered, add them vertically, optionally
//    normalize by ksize*ksize, and store to dst (narrowing back to T if `cast`).
// Template parameters:
//   ksize    - kernel side (3 or 5)
//   helperT  - vector helper for the element type T
//   helperWT - vector helper for the wider accumulator type WT
//   cast     - true to narrow the accumulator to T on store, false to store WT
// margin_* / offset_* describe the position of this ROI inside the full image
// so border interpolation can reach pixels outside the ROI.
template<int ksize, typename helperT, typename helperWT, bool cast>
static inline int boxFilterC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type)
{
using T = typename helperT::ElemType;
using WT = typename helperWT::ElemType;
// Sentinel returned by accessX/accessY when the interpolated border index is
// negative (e.g. BORDER_CONSTANT): that tap contributes nothing to the sum.
constexpr int noval = std::numeric_limits<int>::max();
// Map a vertical kernel tap to a source row index (relative to the ROI).
auto accessX = [&](int x) {
int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type);
return pi < 0 ? noval : pi - offset_y;
};
// Map a horizontal kernel tap to a source column index (relative to the ROI).
auto accessY = [&](int y) {
int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type);
return pj < 0 ? noval : pj - offset_x;
};
// Ring-buffer slot for row x, column y ((x + ksize) keeps the modulo positive).
auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; };
std::vector<WT> res(width * ksize);
// Scalar fallback: horizontal sum of ksize taps at (row x, column y),
// used near the left/right borders where taps may fall outside the row.
auto process = [&](int x, int y) {
WT sum = 0;
for (int i = 0; i < ksize; i++)
{
int p = accessY(y + i);
if (p != noval)
{
sum += reinterpret_cast<const T*>(src_data + x * src_step)[p];
}
}
res[p2idx(x, y)] = sum;
};
// Columns in [left, right) have all horizontal taps in-bounds -> vector path.
const int left = anchor_x, right = width - (ksize - 1 - anchor_x);
// Iterate over the rows needed to produce output rows [start, end): the
// window extends anchor_y rows above and ksize-1-anchor_y rows below.
for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++)
{
if (i + offset_y >= 0 && i + offset_y < full_height)
{
if (left >= right)
{
// ROI narrower than the kernel: scalar path for the whole row.
for (int j = 0; j < width; j++)
process(i, j);
}
else
{
for (int j = 0; j < left; j++)
process(i, j);
for (int j = right; j < width; j++)
process(i, j);
int vl;
for (int j = left; j < right; j += vl)
{
vl = helperT::setvl(right - j);
const T* extra = reinterpret_cast<const T*>(src_data + i * src_step) + j - anchor_x;
auto src = rvv<T>::vcvt0(helperT::vload(extra, vl), vl);
extra += vl;
auto sum = src;
// Each vslide1down shifts the window one column right, feeding the
// next scalar in at the tail; summing the shifted vectors yields the
// ksize-tap horizontal sum for every lane at once.
src = helperWT::vslide1down(src, extra[0], vl);
sum = helperWT::vadd(sum, src, vl);
src = helperWT::vslide1down(src, extra[1], vl);
sum = helperWT::vadd(sum, src, vl);
if (ksize == 5)
{
src = helperWT::vslide1down(src, extra[2], vl);
sum = helperWT::vadd(sum, src, vl);
src = helperWT::vslide1down(src, extra[3], vl);
sum = helperWT::vadd(sum, src, vl);
}
helperWT::vstore(res.data() + p2idx(i, j), sum, vl);
}
}
}
// cur is the output row whose full vertical window is now buffered.
int cur = i - (ksize - 1 - anchor_y);
if (cur >= start)
{
// Resolve the ksize buffered rows for the vertical pass; nullptr marks a
// border tap that contributes nothing (noval from accessX).
const WT* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0);
const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0);
const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0);
const WT* row3 = nullptr, *row4 = nullptr;
if (ksize == 5)
{
row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0);
row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0);
}
int vl;
for (int j = 0; j < width; j += vl)
{
vl = helperWT::setvl(width - j);
auto sum = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl);
if (row1) sum = helperWT::vadd(sum, helperWT::vload(row1 + j, vl), vl);
if (row2) sum = helperWT::vadd(sum, helperWT::vload(row2 + j, vl), vl);
if (row3) sum = helperWT::vadd(sum, helperWT::vload(row3 + j, vl), vl);
if (row4) sum = helperWT::vadd(sum, helperWT::vload(row4 + j, vl), vl);
if (normalize) sum = rvv<T>::vdiv(sum, ksize * ksize, vl);
if (cast)
{
helperT::vstore(reinterpret_cast<T*>(dst_data + cur * dst_step) + j, rvv<T>::vcvt1(sum, vl), vl);
}
else
{
helperWT::vstore(reinterpret_cast<WT*>(dst_data + cur * dst_step) + j, sum, vl);
}
}
}
}
return CV_HAL_ERROR_OK;
}
// Three-channel (CV_32FC3) box filter over rows [start, end). Same separable
// ring-buffer scheme as boxFilterC1, but the three interleaved channels are
// de-interleaved with segment loads (vlseg3e32) and processed in parallel.
// ksize is the kernel side (3 or 5); margin_*/offset_* locate the ROI in the
// full image for border interpolation.
template<int ksize>
static inline int boxFilterC3(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int anchor_x, int anchor_y, bool normalize, int border_type)
{
// Sentinel for a border tap that contributes nothing (see boxFilterC1).
constexpr int noval = std::numeric_limits<int>::max();
auto accessX = [&](int x) {
int pi = common::borderInterpolate(offset_y + x - anchor_y, full_height, border_type);
return pi < 0 ? noval : pi - offset_y;
};
auto accessY = [&](int y) {
int pj = common::borderInterpolate(offset_x + y - anchor_x, full_width, border_type);
return pj < 0 ? noval : pj - offset_x;
};
// Ring-buffer slot for row x, column y; the *3 accounts for the 3 channels.
auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 3; };
std::vector<float> res(width * ksize * 3);
// Scalar fallback near the left/right borders: per-channel horizontal sums.
auto process = [&](int x, int y) {
float sum0, sum1, sum2;
sum0 = sum1 = sum2 = 0;
for (int i = 0; i < ksize; i++)
{
int p = accessY(y + i);
if (p != noval)
{
sum0 += reinterpret_cast<const float*>(src_data + x * src_step)[p * 3 ];
sum1 += reinterpret_cast<const float*>(src_data + x * src_step)[p * 3 + 1];
sum2 += reinterpret_cast<const float*>(src_data + x * src_step)[p * 3 + 2];
}
}
res[p2idx(x, y) ] = sum0;
res[p2idx(x, y) + 1] = sum1;
res[p2idx(x, y) + 2] = sum2;
};
// Columns in [left, right) have all horizontal taps in-bounds -> vector path.
const int left = anchor_x, right = width - (ksize - 1 - anchor_x);
for (int i = start - anchor_y; i < end + (ksize - 1 - anchor_y); i++)
{
if (i + offset_y >= 0 && i + offset_y < full_height)
{
if (left >= right)
{
for (int j = 0; j < width; j++)
process(i, j);
}
else
{
for (int j = 0; j < left; j++)
process(i, j);
for (int j = right; j < width; j++)
process(i, j);
int vl;
for (int j = left; j < right; j += vl)
{
vl = __riscv_vsetvl_e32m2(right - j);
const float* extra = reinterpret_cast<const float*>(src_data + i * src_step) + (j - anchor_x) * 3;
// Segment load de-interleaves BGR triples into three channel vectors.
auto src = __riscv_vlseg3e32_v_f32m2x3(extra, vl);
auto src0 = __riscv_vget_v_f32m2x3_f32m2(src, 0);
auto src1 = __riscv_vget_v_f32m2x3_f32m2(src, 1);
auto src2 = __riscv_vget_v_f32m2x3_f32m2(src, 2);
extra += vl * 3;
auto sum0 = src0, sum1 = src1, sum2 = src2;
// Slide each channel vector one pixel right (feeding the next
// interleaved scalars at the tail) and accumulate, once per extra tap.
src0 = __riscv_vfslide1down(src0, extra[0], vl);
src1 = __riscv_vfslide1down(src1, extra[1], vl);
src2 = __riscv_vfslide1down(src2, extra[2], vl);
sum0 = __riscv_vfadd(sum0, src0, vl);
sum1 = __riscv_vfadd(sum1, src1, vl);
sum2 = __riscv_vfadd(sum2, src2, vl);
src0 = __riscv_vfslide1down(src0, extra[3], vl);
src1 = __riscv_vfslide1down(src1, extra[4], vl);
src2 = __riscv_vfslide1down(src2, extra[5], vl);
sum0 = __riscv_vfadd(sum0, src0, vl);
sum1 = __riscv_vfadd(sum1, src1, vl);
sum2 = __riscv_vfadd(sum2, src2, vl);
if (ksize == 5)
{
src0 = __riscv_vfslide1down(src0, extra[6], vl);
src1 = __riscv_vfslide1down(src1, extra[7], vl);
src2 = __riscv_vfslide1down(src2, extra[8], vl);
sum0 = __riscv_vfadd(sum0, src0, vl);
sum1 = __riscv_vfadd(sum1, src1, vl);
sum2 = __riscv_vfadd(sum2, src2, vl);
src0 = __riscv_vfslide1down(src0, extra[ 9], vl);
src1 = __riscv_vfslide1down(src1, extra[10], vl);
src2 = __riscv_vfslide1down(src2, extra[11], vl);
sum0 = __riscv_vfadd(sum0, src0, vl);
sum1 = __riscv_vfadd(sum1, src1, vl);
sum2 = __riscv_vfadd(sum2, src2, vl);
}
// Re-interleave and store the horizontal sums into the ring buffer.
vfloat32m2x3_t dst{};
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0);
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1);
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2);
__riscv_vsseg3e32(res.data() + p2idx(i, j), dst, vl);
}
}
}
// cur is the output row whose full vertical window is now buffered.
int cur = i - (ksize - 1 - anchor_y);
if (cur >= start)
{
const float* row0 = accessX(cur ) == noval ? nullptr : res.data() + p2idx(accessX(cur ), 0);
const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0);
const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0);
const float* row3 = nullptr, *row4 = nullptr;
if (ksize == 5)
{
row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0);
row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0);
}
int vl;
for (int j = 0; j < width; j += vl)
{
vl = __riscv_vsetvl_e32m2(width - j);
vfloat32m2_t sum0, sum1, sum2;
sum0 = sum1 = sum2 = __riscv_vfmv_v_f_f32m2(0, vl);
// Accumulate one buffered row into the per-channel vertical sums;
// a null row is a border tap contributing nothing.
auto loadres = [&](const float* row) {
if (!row) return;
auto src = __riscv_vlseg3e32_v_f32m2x3(row + j * 3, vl);
sum0 = __riscv_vfadd(sum0, __riscv_vget_v_f32m2x3_f32m2(src, 0), vl);
sum1 = __riscv_vfadd(sum1, __riscv_vget_v_f32m2x3_f32m2(src, 1), vl);
sum2 = __riscv_vfadd(sum2, __riscv_vget_v_f32m2x3_f32m2(src, 2), vl);
};
loadres(row0);
loadres(row1);
loadres(row2);
loadres(row3);
loadres(row4);
if (normalize)
{
sum0 = __riscv_vfdiv(sum0, ksize * ksize, vl);
sum1 = __riscv_vfdiv(sum1, ksize * ksize, vl);
sum2 = __riscv_vfdiv(sum2, ksize * ksize, vl);
}
vfloat32m2x3_t dst{};
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 0, sum0);
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 1, sum1);
dst = __riscv_vset_v_f32m2_f32m2x3(dst, 2, sum2);
__riscv_vsseg3e32(reinterpret_cast<float*>(dst_data + cur * dst_step) + j * 3, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL entry point for cv::boxFilter on RVV. Dispatches on kernel size (3 or 5)
// and on the (src_type, dst_type) combination to the vectorized kernels above.
// Supported: same-type filtering for 8UC1/16SC1/32SC1/32FC1/32FC3, plus the
// unnormalized 8UC1 -> 16UC1 path; anything else returns
// CV_HAL_ERROR_NOT_IMPLEMENTED so the caller falls back to the generic code.
// margin_* describe the ROI position inside the full image for border handling.
// In-place calls (src_data == dst_data) are routed through a temporary buffer.
int boxFilter(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_depth, int dst_depth, int cn, int margin_left, int margin_top, int margin_right, int margin_bottom, size_t ksize_width, size_t ksize_height, int anchor_x, int anchor_y, bool normalize, int border_type)
{
    const int src_type = CV_MAKETYPE(src_depth, cn), dst_type = CV_MAKETYPE(dst_depth, cn);
    // Only square 3x3 and 5x5 kernels are vectorized here.
    if (ksize_width != ksize_height || (ksize_width != 3 && ksize_width != 5))
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    // borderInterpolate in the kernels cannot honor ISOLATED or WRAP semantics.
    if (border_type & BORDER_ISOLATED || border_type == BORDER_WRAP)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    uchar* _dst_data = dst_data;
    size_t _dst_step = dst_step;
    const size_t size = CV_ELEM_SIZE(dst_type);
    std::vector<uchar> dst;
    if (src_data == _dst_data)
    {
        // In-place call: filter into a temporary buffer and copy back at the
        // end. Widen to size_t before multiplying so width*height*size cannot
        // overflow int for large images.
        dst = std::vector<uchar>(static_cast<size_t>(width) * static_cast<size_t>(height) * size);
        dst_data = dst.data();
        dst_step = static_cast<size_t>(width) * size;
    }

    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
    // A negative anchor means "kernel center".
    anchor_x = anchor_x < 0 ? ksize_width / 2 : anchor_x;
    anchor_y = anchor_y < 0 ? ksize_height / 2 : anchor_y;
    if (src_type != dst_type)
    {
        // Widening path (typically the unnormalized sum): 8U input, 16U output.
        if (src_type == CV_8UC1 && dst_type == CV_16UC1)
        {
            if (ksize_width == 3)
            {
                res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            }
            if (ksize_width == 5)
            {
                res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, false>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            }
        }
    }
    else
    {
        // Same-type path: key the dispatch on kernel size * 100 + type so a
        // single switch covers both 3x3 (3xx) and 5x5 (5xx) cases.
        switch (ksize_width*100 + src_type)
        {
        case 300 + CV_8UC1:
            res = common::invoke(height, {boxFilterC1<3, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 500 + CV_8UC1:
            res = common::invoke(height, {boxFilterC1<5, RVV_U8M4, RVV_U16M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 300 + CV_16SC1:
            res = common::invoke(height, {boxFilterC1<3, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 500 + CV_16SC1:
            res = common::invoke(height, {boxFilterC1<5, RVV_I16M4, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 300 + CV_32SC1:
            res = common::invoke(height, {boxFilterC1<3, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 500 + CV_32SC1:
            res = common::invoke(height, {boxFilterC1<5, RVV_I32M8, RVV_I32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 300 + CV_32FC1:
            res = common::invoke(height, {boxFilterC1<3, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 500 + CV_32FC1:
            res = common::invoke(height, {boxFilterC1<5, RVV_F32M8, RVV_F32M8, true>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 300 + CV_32FC3:
            res = common::invoke(height, {boxFilterC3<3>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        case 500 + CV_32FC3:
            res = common::invoke(height, {boxFilterC3<5>}, src_data, src_step, dst_data, dst_step, width, margin_left + width + margin_right, margin_top + height + margin_bottom, margin_left, margin_top, anchor_x, anchor_y, normalize, border_type);
            break;
        }
    }
    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    if (src_data == _dst_data)
    {
        // Copy the temporary result back into the caller's (in-place) buffer.
        for (int i = 0; i < height; i++)
            memcpy(_dst_data + i * _dst_step, dst.data() + i * dst_step, dst_step);
    }
    return res;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -4,12 +4,12 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_COLOR_HPP_INCLUDED
#define OPENCV_HAL_RVV_COLOR_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <limits>
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace imgproc {
namespace cv { namespace cv_hal_rvv {
#if CV_HAL_RVV_1P0_ENABLED
namespace color {
class ColorInvoker : public ParallelLoopBody
@ -41,11 +41,9 @@ namespace color {
{
return val - std::remainder(val, 1.0);
}
} // cv::cv_hal_rvv::color
} // cv::rvv_hal::color
namespace BGRtoBGR {
#undef cv_hal_cvtBGRtoBGR
#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::BGRtoBGR::cvtBGRtoBGR
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -206,27 +204,26 @@ static inline int cvtBGRtoBGR(int start, int end, const T * src, size_t src_step
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue)
} // BGRtoBGR
int cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue)
{
if ((scn != 3 && scn != 4) || (dcn != 3 && dcn != 4))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return cvtBGRtoBGR<uchar>(0, height, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, dcn, swapBlue);
return BGRtoBGR::cvtBGRtoBGR<uchar>(0, height, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, dcn, swapBlue);
case CV_16U:
return cvtBGRtoBGR<ushort>(0, height, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, dcn, swapBlue);
return BGRtoBGR::cvtBGRtoBGR<ushort>(0, height, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, dcn, swapBlue);
case CV_32F:
return cvtBGRtoBGR<float>(0, height, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, dcn, swapBlue);
return BGRtoBGR::cvtBGRtoBGR<float>(0, height, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, dcn, swapBlue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoBGR
namespace GraytoBGR {
#undef cv_hal_cvtGraytoBGR
#define cv_hal_cvtGraytoBGR cv::cv_hal_rvv::GraytoBGR::cvtGraytoBGR
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -337,27 +334,26 @@ static inline int cvtGraytoBGR(int start, int end, const T * src, size_t src_ste
return CV_HAL_ERROR_OK;
}
inline int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn)
} // GraytoBGR
int cvtGraytoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return cvtGraytoBGR<uchar>(0, height, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn);
return GraytoBGR::cvtGraytoBGR<uchar>(0, height, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn);
case CV_16U:
return cvtGraytoBGR<ushort>(0, height, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn);
return GraytoBGR::cvtGraytoBGR<ushort>(0, height, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn);
case CV_32F:
return cvtGraytoBGR<float>(0, height, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn);
return GraytoBGR::cvtGraytoBGR<float>(0, height, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::GraytoBGR
namespace BGRtoGray {
#undef cv_hal_cvtBGRtoGray
#define cv_hal_cvtBGRtoGray cv::cv_hal_rvv::BGRtoGray::cvtBGRtoGray
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -462,27 +458,26 @@ static inline int cvtBGRtoGray(int start, int end, const T * src, size_t src_ste
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue)
} // BGRtoGray
int cvtBGRtoGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtBGRtoGray<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue);
case CV_16U:
return color::invoke(width, height, {cvtBGRtoGray<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue);
case CV_32F:
return color::invoke(width, height, {cvtBGRtoGray<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoGray::cvtBGRtoGray<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoGray
namespace BGR5x5toBGR {
#undef cv_hal_cvtBGR5x5toBGR
#define cv_hal_cvtBGR5x5toBGR cv::cv_hal_rvv::BGR5x5toBGR::cvtBGR5x5toBGR
// the algorithm is copied from imgproc/src/color_rgb.simd.cpp,
// in the functor struct RGB5x52RGB
@ -540,18 +535,17 @@ static inline int cvtBGR5x5toBGR_u(int start, int end, const ushort * src, size_
return CV_HAL_ERROR_OK;
}
inline int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits)
} // BGR5x5toBGR
int cvtBGR5x5toBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int dcn, bool swapBlue, int greenBits)
{
if ((dcn != 3 && dcn != 4) || (greenBits != 5 && greenBits != 6))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height, {cvtBGR5x5toBGR_u}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, greenBits);
return color::invoke(width, height, {BGR5x5toBGR::cvtBGR5x5toBGR_u}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, greenBits);
}
} // cv::cv_hal_rvv::BGR5x5toBGR
namespace BGRtoBGR5x5 {
#undef cv_hal_cvtBGRtoBGR5x5
#define cv_hal_cvtBGRtoBGR5x5 cv::cv_hal_rvv::BGRtoBGR5x5::cvtBGRtoBGR5x5
// the algorithm is copied from imgproc/src/color_rgb.simd.cpp,
// in the functor struct RGB2RGB5x5
@ -604,18 +598,17 @@ static inline int cvtBGRtoBGR5x5_u(int start, int end, const uchar * src, size_t
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits)
} // BGRtoBGR5x5
int cvtBGRtoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int greenBits)
{
if ((scn != 3 && scn != 4) || (greenBits != 5 && greenBits != 6))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height, {cvtBGRtoBGR5x5_u}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue, greenBits);
return color::invoke(width, height, {BGRtoBGR5x5::cvtBGRtoBGR5x5_u}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue, greenBits);
}
} // cv::cv_hal_rvv::BGRtoBGR5x5
namespace BGR5x5toGray {
#undef cv_hal_cvtBGR5x5toGray
#define cv_hal_cvtBGR5x5toGray cv::cv_hal_rvv::BGR5x5toGray::cvtBGR5x5toGray
// the algorithm is copied from imgproc/src/color_rgb.simd.cpp,
// in the functor struct RGB5x52Gray
@ -654,18 +647,17 @@ static inline int cvtBGR5x5toGray_u(int start, int end, const ushort * src, size
return CV_HAL_ERROR_OK;
}
inline int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits)
} // BGR5x5toGray
int cvtBGR5x5toGray(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits)
{
if (greenBits != 5 && greenBits != 6)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height, {cvtBGR5x5toGray_u}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, greenBits);
return color::invoke(width, height, {BGR5x5toGray::cvtBGR5x5toGray_u}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, greenBits);
}
} // cv::cv_hal_rvv::BGR5x5toGray
namespace GraytoBGR5x5 {
#undef cv_hal_cvtGraytoBGR5x5
#define cv_hal_cvtGraytoBGR5x5 cv::cv_hal_rvv::GraytoBGR5x5::cvtGraytoBGR5x5
// the algorithm is copied from imgproc/src/color_rgb.simd.cpp,
// in the functor struct Gray2RGB5x5
@ -697,18 +689,17 @@ static inline int cvtGraytoBGR5x5_u(int start, int end, const uchar * src, size_
return CV_HAL_ERROR_OK;
}
inline int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits)
} // GraytoBGR5x5
int cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int greenBits)
{
if (greenBits != 5 && greenBits != 6)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height, {cvtGraytoBGR5x5_u}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, greenBits);
return color::invoke(width, height, {GraytoBGR5x5::cvtGraytoBGR5x5_u}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, greenBits);
}
} // cv::cv_hal_rvv::GraytoBGR5x5
namespace YUVtoBGR {
#undef cv_hal_cvtYUVtoBGR
#define cv_hal_cvtYUVtoBGR cv::cv_hal_rvv::YUVtoBGR::cvtYUVtoBGR
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -857,27 +848,26 @@ static inline int cvtYUVtoBGR(int start, int end, const T * src, size_t src_step
return CV_HAL_ERROR_OK;
}
inline int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr)
} // YUVtoBGR
int cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isCbCr)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtYUVtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
case CV_16U:
return color::invoke(width, height, {cvtYUVtoBGR<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
case CV_32F:
return color::invoke(width, height, {cvtYUVtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
return color::invoke(width, height, {YUVtoBGR::cvtYUVtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isCbCr);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::YUVtoBGR
namespace BGRtoYUV {
#undef cv_hal_cvtBGRtoYUV
#define cv_hal_cvtBGRtoYUV cv::cv_hal_rvv::BGRtoYUV::cvtBGRtoYUV
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -1027,31 +1017,26 @@ static inline int cvtBGRtoYUV(int start, int end, const T * src, size_t src_step
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr)
} // BGRtoYUV
int cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isCbCr)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtBGRtoYUV<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
case CV_16U:
return color::invoke(width, height, {cvtBGRtoYUV<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
case CV_32F:
return color::invoke(width, height, {cvtBGRtoYUV<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
return color::invoke(width, height, {BGRtoYUV::cvtBGRtoYUV<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isCbCr);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoYUV
namespace PlaneYUVtoBGR {
#undef cv_hal_cvtOnePlaneYUVtoBGR
#define cv_hal_cvtOnePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtOnePlaneYUVtoBGR
#undef cv_hal_cvtTwoPlaneYUVtoBGR
#define cv_hal_cvtTwoPlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtTwoPlaneYUVtoBGR
#undef cv_hal_cvtThreePlaneYUVtoBGR
#define cv_hal_cvtThreePlaneYUVtoBGR cv::cv_hal_rvv::PlaneYUVtoBGR::cvtThreePlaneYUVtoBGR
static const int ITUR_BT_601_SHIFT = 20;
static const int ITUR_BT_601_CY = 1220542;
@ -1241,22 +1226,24 @@ static inline int cvtMultiPlaneYUVtoBGR(int start, int end, uchar * dst_data, si
return CV_HAL_ERROR_OK;
}
inline int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx)
} // PlaneYUVtoBGR
int cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx, int yIdx)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(dst_width, dst_height, {cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx);
return color::invoke(dst_width, dst_height, {PlaneYUVtoBGR::cvtSinglePlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, dcn, swapBlue, uIdx, yIdx);
}
inline int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx)
int cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
const uchar* uv = src_data + src_step * static_cast<size_t>(dst_height);
return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx);
return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, uv, uv, 0, 0, dcn, swapBlue, uIdx);
}
inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx)
int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, int dcn, bool swapBlue, int uIdx)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -1267,17 +1254,10 @@ inline int cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar
int vstepIdx = dst_height % 4 == 2 ? 1 : 0;
if (uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
return color::invoke(dst_width, dst_height / 2, {cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1);
return color::invoke(dst_width, dst_height / 2, {PlaneYUVtoBGR::cvtMultiPlaneYUVtoBGR}, dst_data, dst_step, dst_width, src_step, src_data, u, v, ustepIdx, vstepIdx, dcn, swapBlue, -1);
}
} // cv::cv_hal_rvv::PlaneYUVtoBGR
namespace PlaneBGRtoYUV {
#undef cv_hal_cvtOnePlaneBGRtoYUV
#define cv_hal_cvtOnePlaneBGRtoYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtOnePlaneBGRtoYUV
#undef cv_hal_cvtBGRtoTwoPlaneYUV
#define cv_hal_cvtBGRtoTwoPlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoTwoPlaneYUV
#undef cv_hal_cvtBGRtoThreePlaneYUV
#define cv_hal_cvtBGRtoThreePlaneYUV cv::cv_hal_rvv::PlaneBGRtoYUV::cvtBGRtoThreePlaneYUV
static const int ITUR_BT_601_SHIFT = 20;
static const int ITUR_BT_601_CBY = 102760; // 0.114035 * (236-16)/256 * (1 << ITUR_BT_601_SHIFT)
@ -1512,35 +1492,34 @@ static inline int cvtBGRtoMultiPlaneYUV(int start, int end, uchar * yData, uchar
return CV_HAL_ERROR_OK;
}
inline int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx)
} // PlaneBGRtoYUV
int cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx, int yIdx)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height, {cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx);
return color::invoke(width, height, {PlaneBGRtoYUV::cvtBGRtoSinglePlaneYUV}, dst_data, dst_step, width, src_step, src_data, scn, swapBlue, uIdx, yIdx);
}
inline int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
int cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step,
uchar * y_data, size_t y_step, uchar * uv_data, size_t uv_step,
int width, int height,
int scn, bool swapBlue, int uIdx)
{
if (y_step != uv_step || (scn != 3 && scn != 4))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2);
return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, y_data, uv_data, y_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2);
}
inline int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx)
int cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int scn, bool swapBlue, int uIdx)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
uchar* uv_data = dst_data + dst_step * static_cast<size_t>(height);
return color::invoke(width, height / 2, {cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 3 : 2);
return color::invoke(width, height / 2, {PlaneBGRtoYUV::cvtBGRtoMultiPlaneYUV}, dst_data, uv_data, dst_step, width, height, src_step, src_data, scn, swapBlue, uIdx == 2 ? 3 : 2);
}
} // cv::cv_hal_rvv::PlaneBGRtoYUV
namespace HSVtoBGR {
#undef cv_hal_cvtHSVtoBGR
#define cv_hal_cvtHSVtoBGR cv::cv_hal_rvv::HSVtoBGR::cvtHSVtoBGR
template<typename T>
static inline int cvtHSVtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isFullRange, bool isHSV);
@ -1710,25 +1689,24 @@ inline int cvtHSVtoBGR<float>(int start, int end, const float * src, size_t src_
return CV_HAL_ERROR_OK;
}
inline int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
} // HSVtoBGR
int cvtHSVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isFullRange, bool isHSV)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtHSVtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV);
return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV);
case CV_32F:
return color::invoke(width, height, {cvtHSVtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV);
return color::invoke(width, height, {HSVtoBGR::cvtHSVtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isFullRange, isHSV);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::HSVtoBGR
namespace BGRtoHSV {
#undef cv_hal_cvtBGRtoHSV
#define cv_hal_cvtBGRtoHSV cv::cv_hal_rvv::BGRtoHSV::cvtBGRtoHSV
template<typename T>
static inline int cvtBGRtoHSV(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int scn, bool swapBlue, bool isFullRange, bool isHSV);
@ -1870,25 +1848,24 @@ inline int cvtBGRtoHSV<float>(int start, int end, const float * src, size_t src_
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV)
} // BGRtoHSV
int cvtBGRtoHSV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isFullRange, bool isHSV)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtBGRtoHSV<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV);
return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV);
case CV_32F:
return color::invoke(width, height, {cvtBGRtoHSV<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV);
return color::invoke(width, height, {BGRtoHSV::cvtBGRtoHSV<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isFullRange, isHSV);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoHSV
namespace XYZtoBGR {
#undef cv_hal_cvtXYZtoBGR
#define cv_hal_cvtXYZtoBGR cv::cv_hal_rvv::XYZtoBGR::cvtXYZtoBGR
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -2042,27 +2019,26 @@ static inline int cvtXYZtoBGR(int start, int end, const T * src, size_t src_step
return CV_HAL_ERROR_OK;
}
inline int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue)
} // XYZtoBGR
int cvtXYZtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtXYZtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue);
return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue);
case CV_16U:
return color::invoke(width, height, {cvtXYZtoBGR<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn, swapBlue);
return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, dcn, swapBlue);
case CV_32F:
return color::invoke(width, height, {cvtXYZtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue);
return color::invoke(width, height, {XYZtoBGR::cvtXYZtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::XYZtoBGR
namespace BGRtoXYZ {
#undef cv_hal_cvtBGRtoXYZ
#define cv_hal_cvtBGRtoXYZ cv::cv_hal_rvv::BGRtoXYZ::cvtBGRtoXYZ
template<typename T> struct rvv;
template<> struct rvv<uchar>
@ -2209,23 +2185,24 @@ static inline int cvtBGRtoXYZ(int start, int end, const T * src, size_t src_step
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue)
} // BGRtoXYZ
int cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtBGRtoXYZ<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue);
case CV_16U:
return color::invoke(width, height, {cvtBGRtoXYZ<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ<ushort>}, reinterpret_cast<const ushort*>(src_data), src_step, reinterpret_cast<ushort*>(dst_data), dst_step, width, scn, swapBlue);
case CV_32F:
return color::invoke(width, height, {cvtBGRtoXYZ<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue);
return color::invoke(width, height, {BGRtoXYZ::cvtBGRtoXYZ<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoXYZ
namespace LabTable
{
@ -2495,11 +2472,9 @@ namespace LabTable
return __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(__riscv_vget_v_f32m2x4_f32m2(val, 3), x, __riscv_vget_v_f32m2x4_f32m2(val, 2), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 1), vl), x, __riscv_vget_v_f32m2x4_f32m2(val, 0), vl);
}
};
} // cv::cv_hal_rvv::LabTable
} // cv::rvv_hal::imgproc::LabTable
namespace LabtoBGR {
#undef cv_hal_cvtLabtoBGR
#define cv_hal_cvtLabtoBGR cv::cv_hal_rvv::LabtoBGR::cvtLabtoBGR
template<typename T>
static inline int cvtLabtoBGR(int start, int end, const T * src, size_t src_step, T * dst, size_t dst_step, int width, int dcn, bool swapBlue, bool isLab, bool srgb);
@ -2713,25 +2688,24 @@ inline int cvtLabtoBGR<float>(int start, int end, const float * src, size_t src_
return CV_HAL_ERROR_OK;
}
inline int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb)
} // LabtoBGR
int cvtLabtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int dcn, bool swapBlue, bool isLab, bool srgb)
{
if (dcn != 3 && dcn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtLabtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb);
return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR<uchar>}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb);
case CV_32F:
return color::invoke(width, height, {cvtLabtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb);
return color::invoke(width, height, {LabtoBGR::cvtLabtoBGR<float>}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, dcn, swapBlue, isLab, srgb);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::LabtoBGR
namespace BGRtoLab {
#undef cv_hal_cvtBGRtoLab
#define cv_hal_cvtBGRtoLab cv::cv_hal_rvv::BGRtoLab::cvtBGRtoLab
struct rvv_base
{
@ -3060,31 +3034,126 @@ static inline int cvtBGRtoLab_f(int start, int end, const float * src, size_t sr
return CV_HAL_ERROR_OK;
}
inline int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb)
} // BGRtoLab
int cvtBGRtoLab(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, int depth, int scn, bool swapBlue, bool isLab, bool srgb)
{
if (scn != 3 && scn != 4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
auto cvtBGRtoLab_b = cvtBGRtoLab_u<true, true>;
auto cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u<true, true>;
if (!isLab && !srgb)
cvtBGRtoLab_b = cvtBGRtoLab_u<false, false>;
cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u<false, false>;
else if (!isLab && srgb)
cvtBGRtoLab_b = cvtBGRtoLab_u<false, true>;
cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u<false, true>;
else if (isLab && !srgb)
cvtBGRtoLab_b = cvtBGRtoLab_u<true, false>;
cvtBGRtoLab_b = BGRtoLab::cvtBGRtoLab_u<true, false>;
switch (depth)
{
case CV_8U:
return color::invoke(width, height, {cvtBGRtoLab_b}, reinterpret_cast<const uchar*>(src_data), src_step, reinterpret_cast<uchar*>(dst_data), dst_step, width, scn, swapBlue);
case CV_32F:
return color::invoke(width, height, {cvtBGRtoLab_f}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isLab, srgb);
return color::invoke(width, height, {BGRtoLab::cvtBGRtoLab_f}, reinterpret_cast<const float*>(src_data), src_step, reinterpret_cast<float*>(dst_data), dst_step, width, scn, swapBlue, isLab, srgb);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::BGRtoLab
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
#if CV_HAL_RVV_071_ENABLED
// Byte-gather permutation tables for the RVV 0.7.1 vBGRtoBGR path: each group of
// 4 (or 3) indices maps one pixel with its B and R channels swapped in place
// (alpha byte, when present, is passed through at its original position).
static const unsigned char index_array_32 [32]
    { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 };
static const unsigned char index_array_24 [24]
    { 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 };
// RVV 0.7.1 vectorized channel shuffle for equal channel counts (scn == dcn).
// 'index' is a byte permutation table (index_array_24 / index_array_32) that swaps
// R and B inside each pixel; 'vsize' bytes (= vsize_pixels pixels) are processed
// per vector iteration, remaining pixels are handled scalar.
static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize)
{
    // The permutation pattern is loop-invariant: load it once.
    vuint8m2_t vec_index = vle8_v_u8m2(index, vsize);
    int i = 0;
    // Main vector loop: gather 'vsize' source bytes through the index table.
    // NOTE(review): the bound compares the pixel counter 'i' against n - vsize
    // (bytes), which is more conservative than n - vsize_pixels; correctness is
    // preserved because the scalar tail below picks up the remainder — confirm
    // this matches the intended upstream behavior.
    for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize)
    {
        vuint8m2_t vec_src = vle8_v_u8m2(src, vsize);
        vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize);
        vse8_v_u8m2(dst, vec_dst, vsize);
    }
    // Scalar tail: swap B and R per pixel; copy alpha when writing 4 channels.
    for ( ; i < n; i++, src += scn, dst += dcn )
    {
        unsigned char t0 = src[0], t1 = src[1], t2 = src[2];
        dst[2] = t0;
        dst[1] = t1;
        dst[0] = t2;
        if(dcn == 4)
        {
            unsigned char d = src[3];
            dst[3] = d;
        }
    }
}
// Scalar per-pixel channel shuffle: copies n pixels from src (scn channels) to
// dst (dcn channels). 'bi' is the destination index of the blue channel (0 keeps
// BGR order, 2 swaps to RGB); bi^2 is then the red slot. When expanding from
// 3 to 4 channels the alpha byte is filled with 255 (fully opaque).
static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi)
{
    const unsigned char opaque = std::numeric_limits<unsigned char>::max();
    while (n-- > 0)
    {
        const unsigned char b = src[0];
        const unsigned char g = src[1];
        const unsigned char r = src[2];
        dst[bi]     = b;
        dst[1]      = g;
        dst[bi ^ 2] = r;
        if (dcn == 4)
            dst[3] = (scn == 4) ? src[3] : opaque;
        src += scn;
        dst += dcn;
    }
}
// HAL entry point: converts between BGR/BGRA/RGB/RGBA channel layouts for 8-bit
// images (RVV 0.7.1 path). Returns CV_HAL_ERROR_NOT_IMPLEMENTED for non-CV_8U
// depth and for the no-op case (same channel count, no blue/red swap), so the
// generic OpenCV implementation handles those.
int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue)
{
    if (depth != CV_8U)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    if (scn != dcn)
    {
        // Channel count changes (3 <-> 4): scalar per-pixel shuffle handles it.
        const int blueIdx = swapBlue ? 2 : 0;
        for (int row = 0; row < height; row++, src_data += src_step, dst_data += dst_step)
            sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx);
        return CV_HAL_ERROR_OK;
    }

    // Same channel count: only the swapped-blue case needs any work here.
    if (!swapBlue)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    const int vsize_pixels = 8;
    const unsigned char* index = (scn == 4) ? index_array_32 : index_array_24;
    const int vsize = (scn == 4) ? 32 : 24;
    for (int row = 0; row < height; row++, src_data += src_step, dst_data += dst_step)
        vBGRtoBGR(src_data, dst_data, index, width, scn, dcn, vsize_pixels, vsize);
    return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_071_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,76 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
// Third party copyrights are property of their respective owners.
#ifndef OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED
#define OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED
#include "opencv2/core/hal/interface.h"
#include "opencv2/imgproc/hal/interface.h"
namespace cv { namespace rvv_hal { namespace imgproc { namespace common {
// Maps coordinate p into the valid range [0, len) according to the HAL border
// mode. Returns -1 for CV_HAL_BORDER_CONSTANT (caller substitutes the border
// value); an unrecognized border type returns p unchanged.
inline int borderInterpolate( int p, int len, int borderType )
{
    // Fast path: already inside the image.
    if ((unsigned)p < (unsigned)len)
        return p;

    if (borderType == CV_HAL_BORDER_REPLICATE)
        return p < 0 ? 0 : len - 1;

    if (borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
    {
        if (len == 1)
            return 0;
        // REFLECT_101 does not repeat the edge pixel, hence the extra offset.
        const int delta = (borderType == CV_HAL_BORDER_REFLECT_101) ? 1 : 0;
        // Fold p back into range; may take several iterations for far-out p.
        while ((unsigned)p >= (unsigned)len)
            p = (p < 0) ? (-p - 1 + delta) : (2 * len - 1 - p - delta);
        return p;
    }

    if (borderType == CV_HAL_BORDER_WRAP)
    {
        if (p < 0)
            p -= ((p - len + 1) / len) * len;
        if (p >= len)
            p %= len;
        return p;
    }

    if (borderType == CV_HAL_BORDER_CONSTANT)
        return -1;

    return p;
}
// Adapter that lets cv::parallel_for_ drive a row-range worker: all trailing
// arguments are bound at construction, so operator() only supplies the
// [start, end) row range from the Range it receives.
class FilterInvoker : public ParallelLoopBody
{
public:
    template<typename... Args>
    FilterInvoker(std::function<int(int, int, Args...)> _func, Args&&... args)
    {
        // _1/_2 become range.start/range.end at invocation time.
        func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
    }

    virtual void operator()(const Range& range) const override
    {
        // The worker's int status is discarded here; see common::invoke for how
        // the overall return code is produced.
        func(range.start, range.end);
    }

private:
    std::function<int(int, int)> func;  // fully-bound worker taking (start, end)
};
// Runs 'func' over rows [1, height) in parallel, then processes row 0 on the
// calling thread and returns that call's status code.
// NOTE(review): only row 0's return value is propagated; status codes from the
// parallel rows are dropped by FilterInvoker::operator() — confirm workers are
// expected to be infallible once invoked.
template<typename... Args>
inline int invoke(int height, std::function<int(int, int, Args...)> func, Args&&... args)
{
    cv::parallel_for_(Range(1, height), FilterInvoker(func, std::forward<Args>(args)...), cv::getNumThreads());
    return func(0, 1, std::forward<Args>(args)...);
}
}}}} // cv::rvv_hal::imgproc::common
#endif // OPENCV_HAL_RVV_IMGPROC_COMMON_HPP_INCLUDED

View File

@ -0,0 +1,264 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Per-filter context allocated by filterInit, used by filter() workers and
// released by filterFree. Mirrors the cv_hal_filterInit parameters.
struct Filter2D
{
    const uchar* kernel_data;  // CV_32FC1 coefficients, row-major
    size_t kernel_step;        // byte stride between kernel rows
    int kernel_type;           // always CV_32FC1 (validated in filterInit)
    int kernel_width;          // 3 or 5; kernels are square
    int kernel_height;
    int src_type;              // always CV_8UC4 (validated in filterInit)
    int dst_type;              // always CV_8UC4 (validated in filterInit)
    int borderType;            // CV_HAL_BORDER_* plus optional BORDER_ISOLATED bit
    double delta;              // offset added to every filtered value
    int anchor_x;              // kernel anchor; negative inputs replaced by center
    int anchor_y;
};
// Vectorized 3x3 convolution of one output row segment [left, right) for CV_8UC4.
// row0..row2 are the three border-resolved source rows (nullptr means the row is
// outside the image and contributes nothing). 'kernel' holds the 9 float taps
// row-major; 'delta' seeds each accumulator; 'anchor' is the horizontal anchor.
static void process3(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, uchar* dst)
{
    int vl;
    for (int i = left; i < right; i += vl)
    {
        vl = __riscv_vsetvl_e8m1(right - i);
        // One f32 accumulator per interleaved channel, primed with delta.
        auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl);
        // Apply three horizontal taps: accumulate k*b, then slide the window one
        // pixel right, injecting the next scalar (r1, r2) at the vector tail.
        auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float r1, float r2) {
            a = __riscv_vfmacc(a, k0, b, vl);
            b = __riscv_vfslide1down(b, r1, vl);
            a = __riscv_vfmacc(a, k1, b, vl);
            b = __riscv_vfslide1down(b, r2, vl);
            return __riscv_vfmacc(a, k2, b, vl);
        };
        // Deinterleave one source row into 4 channel vectors, widen u8 -> f32,
        // then apply that row's three kernel taps to every channel accumulator.
        auto loadsrc = [&](const uchar* row, float k0, float k1, float k2) {
            if (!row) return;
            const uchar* extra = row + (i - anchor) * 4;
            auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl);
            auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl);
            auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl);
            auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl);
            auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl);
            extra += vl * 4;  // 'extra' now points at the scalars fed into the slides
            s0 = addshift(s0, v0, k0, k1, k2, extra[0], extra[4]);
            s1 = addshift(s1, v1, k0, k1, k2, extra[1], extra[5]);
            s2 = addshift(s2, v2, k0, k1, k2, extra[2], extra[6]);
            s3 = addshift(s3, v3, k0, k1, k2, extra[3], extra[7]);
        };
        loadsrc(row0, kernel[0], kernel[1], kernel[2]);
        loadsrc(row1, kernel[3], kernel[4], kernel[5]);
        loadsrc(row2, kernel[6], kernel[7], kernel[8]);
        // Narrow f32 -> u16 -> u8 with round-to-nearest-up and saturation, then
        // re-interleave the four channels back into BGRA pixels.
        vuint8m1x4_t val{};
        val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl));
        __riscv_vsseg4e8(dst + i * 4, val, vl);
    }
}
// Vectorized 5x5 convolution of one output row segment [left, right) for CV_8UC4.
// Same structure as process3, extended to five border-resolved source rows and
// five horizontal taps per row (kernel is 25 floats, row-major).
static void process5(int anchor, int left, int right, float delta, const float* kernel, const uchar* row0, const uchar* row1, const uchar* row2, const uchar* row3, const uchar* row4, uchar* dst)
{
    int vl;
    for (int i = left; i < right; i += vl)
    {
        vl = __riscv_vsetvl_e8m1(right - i);
        // One f32 accumulator per interleaved channel, primed with delta.
        auto s0 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s1 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s2 = __riscv_vfmv_v_f_f32m4(delta, vl);
        auto s3 = __riscv_vfmv_v_f_f32m4(delta, vl);
        // Five-tap horizontal pass: multiply-accumulate, then slide one pixel
        // right (injecting scalars r1..r4 at the tail) between consecutive taps.
        auto addshift = [&](vfloat32m4_t a, vfloat32m4_t b, float k0, float k1, float k2, float k3, float k4, float r1, float r2, float r3, float r4) {
            a = __riscv_vfmacc(a, k0, b, vl);
            b = __riscv_vfslide1down(b, r1, vl);
            a = __riscv_vfmacc(a, k1, b, vl);
            b = __riscv_vfslide1down(b, r2, vl);
            a = __riscv_vfmacc(a, k2, b, vl);
            b = __riscv_vfslide1down(b, r3, vl);
            a = __riscv_vfmacc(a, k3, b, vl);
            b = __riscv_vfslide1down(b, r4, vl);
            return __riscv_vfmacc(a, k4, b, vl);
        };
        // Deinterleave one source row into 4 channel vectors, widen u8 -> f32,
        // then apply that row's five kernel taps to every channel accumulator.
        auto loadsrc = [&](const uchar* row, float k0, float k1, float k2, float k3, float k4) {
            if (!row) return;
            const uchar* extra = row + (i - anchor) * 4;
            auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl);
            auto v0 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl), vl);
            auto v1 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl), vl);
            auto v2 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl), vl);
            auto v3 = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl), vl);
            extra += vl * 4;  // 'extra' now points at the scalars fed into the slides
            s0 = addshift(s0, v0, k0, k1, k2, k3, k4, extra[0], extra[4], extra[ 8], extra[12]);
            s1 = addshift(s1, v1, k0, k1, k2, k3, k4, extra[1], extra[5], extra[ 9], extra[13]);
            s2 = addshift(s2, v2, k0, k1, k2, k3, k4, extra[2], extra[6], extra[10], extra[14]);
            s3 = addshift(s3, v3, k0, k1, k2, k3, k4, extra[3], extra[7], extra[11], extra[15]);
        };
        loadsrc(row0, kernel[ 0], kernel[ 1], kernel[ 2], kernel[ 3], kernel[ 4]);
        loadsrc(row1, kernel[ 5], kernel[ 6], kernel[ 7], kernel[ 8], kernel[ 9]);
        loadsrc(row2, kernel[10], kernel[11], kernel[12], kernel[13], kernel[14]);
        loadsrc(row3, kernel[15], kernel[16], kernel[17], kernel[18], kernel[19]);
        loadsrc(row4, kernel[20], kernel[21], kernel[22], kernel[23], kernel[24]);
        // Narrow f32 -> u16 -> u8 with round-to-nearest-up and saturation, then
        // re-interleave the four channels back into BGRA pixels.
        vuint8m1x4_t val{};
        val = __riscv_vset_v_u8m1_u8m1x4(val, 0, __riscv_vnclipu(__riscv_vfncvt_xu(s0, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 1, __riscv_vnclipu(__riscv_vfncvt_xu(s1, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 2, __riscv_vnclipu(__riscv_vfncvt_xu(s2, vl), 0, __RISCV_VXRM_RNU, vl));
        val = __riscv_vset_v_u8m1_u8m1x4(val, 3, __riscv_vnclipu(__riscv_vfncvt_xu(s3, vl), 0, __RISCV_VXRM_RNU, vl));
        __riscv_vsseg4e8(dst + i * 4, val, vl);
    }
}
// the algorithm is copied from 3rdparty/carotene/src/convolution.cpp,
// in the function void CAROTENE_NS::convolution
// Convolution worker for output rows [start, end); ksize is 3 or 5. Pixels whose
// kernel window is fully inside the row horizontally go through process3/process5;
// the remaining (border) pixels use a scalar fallback honoring the border mode.
template<int ksize>
static inline int filter(int start, int end, Filter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
{
    // Densify the kernel into a local array (kernel_step may pad rows).
    float kernel[ksize * ksize];
    for (int i = 0; i < ksize * ksize; i++)
    {
        kernel[i] = reinterpret_cast<const float*>(data->kernel_data + (i / ksize) * data->kernel_step)[i % ksize];
    }

    // 'noval' marks coordinates that resolve to a constant border (no source pixel).
    constexpr int noval = std::numeric_limits<int>::max();
    // Resolve output coordinate (x, y) to a source (row, col) pair, applying border
    // interpolation inside the ROI (BORDER_ISOLATED) or against the full image.
    auto access = [&](int x, int y) {
        int pi, pj;
        if (data->borderType & BORDER_ISOLATED)
        {
            pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED);
            pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED);
            pi = pi < 0 ? noval : pi;
            pj = pj < 0 ? noval : pj;
        }
        else
        {
            pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType);
            pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType);
            pi = pi < 0 ? noval : pi - offset_y;
            pj = pj < 0 ? noval : pj - offset_x;
        }
        return std::make_pair(pi, pj);
    };
    // Scalar fallback for one output pixel: accumulate all ksize*ksize taps with
    // border handling, then round and saturate each of the 4 channels to u8.
    auto process = [&](int x, int y) {
        float sum0, sum1, sum2, sum3;
        sum0 = sum1 = sum2 = sum3 = data->delta;
        for (int i = 0; i < ksize * ksize; i++)
        {
            auto p = access(x + i / ksize, y + i % ksize);
            if (p.first != noval && p.second != noval)
            {
                sum0 += kernel[i] * src_data[p.first * src_step + p.second * 4    ];
                sum1 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 1];
                sum2 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 2];
                sum3 += kernel[i] * src_data[p.first * src_step + p.second * 4 + 3];
            }
        }
        dst_data[(x * width + y) * 4    ] = std::max(0, std::min((int)std::round(sum0), (int)std::numeric_limits<uchar>::max()));
        dst_data[(x * width + y) * 4 + 1] = std::max(0, std::min((int)std::round(sum1), (int)std::numeric_limits<uchar>::max()));
        dst_data[(x * width + y) * 4 + 2] = std::max(0, std::min((int)std::round(sum2), (int)std::numeric_limits<uchar>::max()));
        dst_data[(x * width + y) * 4 + 3] = std::max(0, std::min((int)std::round(sum3), (int)std::numeric_limits<uchar>::max()));
    };

    // Columns in [left, right) have a fully in-bounds horizontal window.
    const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x);
    for (int i = start; i < end; i++)
    {
        if (left >= right)
        {
            // Row narrower than the kernel: everything is border handling.
            for (int j = 0; j < width; j++)
                process(i, j);
        }
        else
        {
            // Left and right borders scalar, interior vectorized.
            for (int j = 0; j < left; j++)
                process(i, j);
            for (int j = right; j < width; j++)
                process(i, j);
            // Resolve each contributing source row once; nullptr = constant border.
            const uchar* row0 = access(i    , 0).first == noval ? nullptr : src_data + access(i    , 0).first * src_step;
            const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step;
            const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step;
            if (ksize == 3)
            {
                process3(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, dst_data + i * width * 4);
            }
            else
            {
                const uchar* row3 = access(i + 3, 0).first == noval ? nullptr : src_data + access(i + 3, 0).first * src_step;
                const uchar* row4 = access(i + 4, 0).first == noval ? nullptr : src_data + access(i + 4, 0).first * src_step;
                process5(data->anchor_x, left, right, data->delta, kernel, row0, row1, row2, row3, row4, dst_data + i * width * 4);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL hook: validates the filter configuration and allocates the Filter2D
// context consumed by filter()/filterFree(). Only square 3x3 / 5x5 CV_32FC1
// kernels over CV_8UC4 images are accepted; BORDER_WRAP is not supported.
int filterInit(cvhalFilter2D** context, uchar* kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, int /*max_width*/, int /*max_height*/, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool /*allowSubmatrix*/, bool /*allowInplace*/)
{
    const bool types_ok  = (kernel_type == CV_32FC1) && (src_type == CV_8UC4) && (dst_type == CV_8UC4);
    const bool size_ok   = (kernel_width == kernel_height) && (kernel_width == 3 || kernel_width == 5);
    const bool border_ok = ((borderType & ~BORDER_ISOLATED) != BORDER_WRAP);
    if (!types_ok || !size_ok || !border_ok)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // A negative anchor means "use the kernel center".
    if (anchor_x < 0)
        anchor_x = kernel_width / 2;
    if (anchor_y < 0)
        anchor_y = kernel_height / 2;

    *context = reinterpret_cast<cvhalFilter2D*>(new Filter2D{kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, src_type, dst_type, borderType, delta, anchor_x, anchor_y});
    return CV_HAL_ERROR_OK;
}
// HAL hook: dispatches to the 3x3 or 5x5 worker over all rows, filtering into a
// contiguous temporary buffer, then copies the result row-by-row into dst_data
// (which may have a row stride different from width * 4).
int filter(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
{
    Filter2D* data = reinterpret_cast<Filter2D*>(context);
    std::vector<uchar> dst(width * height * 4);

    int res;
    if (data->kernel_width == 3)
        res = common::invoke(height, {filter<3>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y);
    else if (data->kernel_width == 5)
        res = common::invoke(height, {filter<5>}, data, src_data, src_step, dst.data(), width, height, full_width, full_height, offset_x, offset_y);
    else
        res = CV_HAL_ERROR_NOT_IMPLEMENTED;  // filterInit guarantees 3 or 5

    for (int i = 0; i < height; i++)
        memcpy(dst_data + i * dst_step, dst.data() + i * width * 4, width * 4);
    return res;
}
// HAL hook: releases the Filter2D context allocated by filterInit.
int filterFree(cvhalFilter2D* context)
{
    Filter2D* data = reinterpret_cast<Filter2D*>(context);
    delete data;
    return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,389 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is same as cv_hal_sepFilter
// Binomial "Gaussian" blur for a single-channel image, rows [start, end).
// The separable kernel ({1,2,1}/4 or {1,4,6,4,1}/16) is applied in two passes:
// a horizontal pass widens elements from T to WT and writes one row of column
// sums into a ring buffer `res` holding ksize rows, and a vertical pass
// combines the buffered rows and narrows back to T with rounding.
// helperT/helperWT are project wrapper types carrying ElemType/VecType and
// setvl/vload/vstore for the narrow and widened element widths.
template<int ksize, typename helperT, typename helperWT>
static inline int gaussianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type)
{
    using T = typename helperT::ElemType;   // narrow element type (e.g. uchar)
    using WT = typename helperWT::ElemType; // widened accumulator type
    // Sentinel meaning "pixel rejected by the border mode" (outside the image).
    constexpr int noval = std::numeric_limits<int>::max();
    // Map a kernel-relative row index to a source row, resolving the border
    // mode against the full (pre-ROI) image; returns noval when absent.
    auto accessX = [&](int x) {
        int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type); // [TODO] fix dependencies
        return pi < 0 ? noval : pi - offset_y;
    };
    // Same mapping for columns.
    auto accessY = [&](int y) {
        int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type);
        return pj < 0 ? noval : pj - offset_x;
    };
    // Index of pixel (row x, col y) inside the ksize-row ring buffer `res`.
    auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; };

    // Binomial weight rows; kernel[0] for ksize==3, kernel[1] for ksize==5.
    constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}};
    std::vector<WT> res(width * ksize);
    // Scalar fallback for a single pixel of the horizontal pass (used near the
    // left/right borders where the vectorized slide would read out of bounds).
    auto process = [&](int x, int y) {
        WT sum = 0;
        for (int i = 0; i < ksize; i++)
        {
            int p = accessY(y + i);
            if (p != noval)
            {
                sum += kernel[ksize == 5][i] * static_cast<WT>(reinterpret_cast<const T*>(src_data + x * src_step)[p]);
            }
        }
        res[p2idx(x, y)] = sum;
    };

    const int left = ksize / 2, right = width - ksize / 2;
    // Iterate ksize/2 rows beyond [start, end) so every output row has all of
    // its ksize buffered input rows available when the vertical pass runs.
    for (int i = start - ksize / 2; i < end + ksize / 2; i++)
    {
        if (i + offset_y >= 0 && i + offset_y < full_height)
        {
            if (left >= right)
            {
                // Image narrower than the kernel: scalar path only.
                for (int j = 0; j < width; j++)
                    process(i, j);
            }
            else
            {
                // Scalar borders, vectorized interior.
                for (int j = 0; j < left; j++)
                    process(i, j);
                for (int j = right; j < width; j++)
                    process(i, j);
                int vl;
                for (int j = left; j < right; j += vl)
                {
                    vl = helperT::setvl(right - j);
                    const T* extra = reinterpret_cast<const T*>(src_data + i * src_step) + j - ksize / 2;
                    auto src = __riscv_vzext_vf2(helperT::vload(extra, vl), vl);
                    extra += vl;
                    auto sum = src;
                    // Accumulate weights via shifts: x2 = <<1, x4 = <<2,
                    // x6 = <<1 + <<2; each slide1down advances the window.
                    if (ksize == 3)
                    {
                        src = __riscv_vslide1down(src, extra[0], vl);
                        sum = __riscv_vadd(sum, __riscv_vsll(src, 1, vl), vl);
                        src = __riscv_vslide1down(src, extra[1], vl);
                        sum = __riscv_vadd(sum, src, vl);
                    }
                    else
                    {
                        src = __riscv_vslide1down(src, extra[0], vl);
                        sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl);
                        src = __riscv_vslide1down(src, extra[1], vl);
                        sum = __riscv_vadd(sum, __riscv_vadd(__riscv_vsll(src, 1, vl), __riscv_vsll(src, 2, vl), vl), vl);
                        src = __riscv_vslide1down(src, extra[2], vl);
                        sum = __riscv_vadd(sum, __riscv_vsll(src, 2, vl), vl);
                        src = __riscv_vslide1down(src, extra[3], vl);
                        sum = __riscv_vadd(sum, src, vl);
                    }
                    helperWT::vstore(res.data() + p2idx(i, j), sum, vl);
                }
            }
        }
        // Vertical pass: row `cur` has all of its contributing rows buffered.
        int cur = i - ksize / 2;
        if (cur >= start)
        {
            // nullptr marks a row rejected by the border mode (contributes 0).
            const WT* row0 = accessX(cur    ) == noval ? nullptr : res.data() + p2idx(accessX(cur    ), 0);
            const WT* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0);
            const WT* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0);
            const WT* row3 = nullptr, *row4 = nullptr;
            if (ksize == 5)
            {
                row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0);
                row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0);
            }
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = helperWT::setvl(width - j);
                auto v0 = row0 ? helperWT::vload(row0 + j, vl) : helperWT::vmv(0, vl);
                auto v1 = row1 ? helperWT::vload(row1 + j, vl) : helperWT::vmv(0, vl);
                auto v2 = row2 ? helperWT::vload(row2 + j, vl) : helperWT::vmv(0, vl);
                typename helperWT::VecType sum;
                if (ksize == 3)
                {
                    // v0 + 2*v1 + v2
                    sum = __riscv_vadd(__riscv_vadd(v0, v2, vl), __riscv_vsll(v1, 1, vl), vl);
                }
                else
                {
                    // v0 + 4*v1 + 6*v2 + 4*v3 + v4 (6 = <<1 + <<2)
                    sum = __riscv_vadd(v0, __riscv_vadd(__riscv_vsll(v2, 1, vl), __riscv_vsll(v2, 2, vl), vl), vl);
                    auto v3 = row3 ? helperWT::vload(row3 + j, vl) : helperWT::vmv(0, vl);
                    sum = __riscv_vadd(sum, __riscv_vsll(__riscv_vadd(v1, v3, vl), 2, vl), vl);
                    auto v4 = row4 ? helperWT::vload(row4 + j, vl) : helperWT::vmv(0, vl);
                    sum = __riscv_vadd(sum, v4, vl);
                }
                // Both passes together scale by 16 (ksize 3) or 256 (ksize 5);
                // vnclipu shifts by 4 or 8 with round-to-nearest-up and
                // saturates back to the narrow type.
                helperT::vstore(reinterpret_cast<T*>(dst_data + cur * dst_step) + j, __riscv_vnclipu(sum, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl), vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
// Binomial "Gaussian" blur for 4-channel 8-bit images, rows [start, end).
// Same two-pass structure as gaussianBlurC1, but pixels are deinterleaved
// into four per-channel vectors with segmented loads (vlseg4) and written
// back interleaved (vsseg4); intermediate sums are kept as ushort.
template<int ksize>
static inline int gaussianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int full_width, int full_height, int offset_x, int offset_y, int border_type)
{
    // Sentinel meaning "pixel rejected by the border mode".
    constexpr int noval = std::numeric_limits<int>::max();
    // Border-aware row index relative to the ROI (see gaussianBlurC1).
    auto accessX = [&](int x) {
        int pi = common::borderInterpolate(offset_y + x - ksize / 2, full_height, border_type);
        return pi < 0 ? noval : pi - offset_y;
    };
    // Border-aware column index relative to the ROI.
    auto accessY = [&](int y) {
        int pj = common::borderInterpolate(offset_x + y - ksize / 2, full_width, border_type);
        return pj < 0 ? noval : pj - offset_x;
    };
    // Ring-buffer index of pixel (row x, col y); x4 for the four channels.
    auto p2idx = [&](int x, int y){ return ((x + ksize) % ksize * width + y) * 4; };

    constexpr uint kernel[2][5] = {{1, 2, 1}, {1, 4, 6, 4, 1}};
    std::vector<ushort> res(width * ksize * 4);
    // Scalar horizontal pass for one interleaved pixel (border columns).
    auto process = [&](int x, int y) {
        ushort sum0, sum1, sum2, sum3;
        sum0 = sum1 = sum2 = sum3 = 0;
        for (int i = 0; i < ksize; i++)
        {
            int p = accessY(y + i);
            if (p != noval)
            {
                sum0 += kernel[ksize == 5][i] * static_cast<ushort>((src_data + x * src_step)[p * 4    ]);
                sum1 += kernel[ksize == 5][i] * static_cast<ushort>((src_data + x * src_step)[p * 4 + 1]);
                sum2 += kernel[ksize == 5][i] * static_cast<ushort>((src_data + x * src_step)[p * 4 + 2]);
                sum3 += kernel[ksize == 5][i] * static_cast<ushort>((src_data + x * src_step)[p * 4 + 3]);
            }
        }
        res[p2idx(x, y)    ] = sum0;
        res[p2idx(x, y) + 1] = sum1;
        res[p2idx(x, y) + 2] = sum2;
        res[p2idx(x, y) + 3] = sum3;
    };

    const int left = ksize / 2, right = width - ksize / 2;
    // Run ksize/2 rows ahead so the vertical pass always has ksize rows ready.
    for (int i = start - ksize / 2; i < end + ksize / 2; i++)
    {
        if (i + offset_y >= 0 && i + offset_y < full_height)
        {
            if (left >= right)
            {
                for (int j = 0; j < width; j++)
                    process(i, j);
            }
            else
            {
                // Scalar borders, vectorized interior.
                for (int j = 0; j < left; j++)
                    process(i, j);
                for (int j = right; j < width; j++)
                    process(i, j);
                int vl;
                for (int j = left; j < right; j += vl)
                {
                    vl = __riscv_vsetvl_e8m1(right - j);
                    const uchar* extra = src_data + i * src_step + (j - ksize / 2) * 4;
                    // Deinterleave RGBA into four channel vectors, widened to u16.
                    auto src = __riscv_vlseg4e8_v_u8m1x4(extra, vl);
                    auto src0 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 0), vl);
                    auto src1 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 1), vl);
                    auto src2 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 2), vl);
                    auto src3 = __riscv_vzext_vf2(__riscv_vget_v_u8m1x4_u8m1(src, 3), vl);
                    extra += vl * 4;
                    auto sum0 = src0, sum1 = src1, sum2 = src2, sum3 = src3;
                    // Weighted sum via shifts, one slide1down per kernel tap,
                    // applied identically to all four channels.
                    if (ksize == 3)
                    {
                        src0 = __riscv_vslide1down(src0, extra[0], vl);
                        src1 = __riscv_vslide1down(src1, extra[1], vl);
                        src2 = __riscv_vslide1down(src2, extra[2], vl);
                        src3 = __riscv_vslide1down(src3, extra[3], vl);
                        sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 1, vl), vl);
                        sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 1, vl), vl);
                        sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 1, vl), vl);
                        sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 1, vl), vl);
                        src0 = __riscv_vslide1down(src0, extra[4], vl);
                        src1 = __riscv_vslide1down(src1, extra[5], vl);
                        src2 = __riscv_vslide1down(src2, extra[6], vl);
                        src3 = __riscv_vslide1down(src3, extra[7], vl);
                        sum0 = __riscv_vadd(sum0, src0, vl);
                        sum1 = __riscv_vadd(sum1, src1, vl);
                        sum2 = __riscv_vadd(sum2, src2, vl);
                        sum3 = __riscv_vadd(sum3, src3, vl);
                    }
                    else
                    {
                        src0 = __riscv_vslide1down(src0, extra[0], vl);
                        src1 = __riscv_vslide1down(src1, extra[1], vl);
                        src2 = __riscv_vslide1down(src2, extra[2], vl);
                        src3 = __riscv_vslide1down(src3, extra[3], vl);
                        sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl);
                        sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl);
                        sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl);
                        sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl);
                        src0 = __riscv_vslide1down(src0, extra[4], vl);
                        src1 = __riscv_vslide1down(src1, extra[5], vl);
                        src2 = __riscv_vslide1down(src2, extra[6], vl);
                        src3 = __riscv_vslide1down(src3, extra[7], vl);
                        // Center tap weight 6 decomposed as <<1 + <<2.
                        sum0 = __riscv_vadd(sum0, __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl), vl);
                        sum1 = __riscv_vadd(sum1, __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl), vl);
                        sum2 = __riscv_vadd(sum2, __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl), vl);
                        sum3 = __riscv_vadd(sum3, __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl), vl);
                        src0 = __riscv_vslide1down(src0, extra[ 8], vl);
                        src1 = __riscv_vslide1down(src1, extra[ 9], vl);
                        src2 = __riscv_vslide1down(src2, extra[10], vl);
                        src3 = __riscv_vslide1down(src3, extra[11], vl);
                        sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl);
                        sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl);
                        sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl);
                        sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl);
                        src0 = __riscv_vslide1down(src0, extra[12], vl);
                        src1 = __riscv_vslide1down(src1, extra[13], vl);
                        src2 = __riscv_vslide1down(src2, extra[14], vl);
                        src3 = __riscv_vslide1down(src3, extra[15], vl);
                        sum0 = __riscv_vadd(sum0, src0, vl);
                        sum1 = __riscv_vadd(sum1, src1, vl);
                        sum2 = __riscv_vadd(sum2, src2, vl);
                        sum3 = __riscv_vadd(sum3, src3, vl);
                    }
                    // Re-interleave the four channel sums into the ring buffer.
                    vuint16m2x4_t dst{};
                    dst = __riscv_vset_v_u16m2_u16m2x4(dst, 0, sum0);
                    dst = __riscv_vset_v_u16m2_u16m2x4(dst, 1, sum1);
                    dst = __riscv_vset_v_u16m2_u16m2x4(dst, 2, sum2);
                    dst = __riscv_vset_v_u16m2_u16m2x4(dst, 3, sum3);
                    __riscv_vsseg4e16(res.data() + p2idx(i, j), dst, vl);
                }
            }
        }
        // Vertical pass for completed output row `cur`.
        int cur = i - ksize / 2;
        if (cur >= start)
        {
            // nullptr marks a border-rejected row (contributes 0).
            const ushort* row0 = accessX(cur    ) == noval ? nullptr : res.data() + p2idx(accessX(cur    ), 0);
            const ushort* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0);
            const ushort* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0);
            const ushort* row3 = nullptr, *row4 = nullptr;
            if (ksize == 5)
            {
                row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0);
                row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0);
            }
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e16m2(width - j);
                vuint16m2_t sum0, sum1, sum2, sum3, src0{}, src1{}, src2{}, src3{};
                sum0 = sum1 = sum2 = sum3 = __riscv_vmv_v_x_u16m2(0, vl);
                // Load one buffered row, deinterleaved into src0..src3.
                auto loadres = [&](const ushort* row) {
                    auto src = __riscv_vlseg4e16_v_u16m2x4(row + j * 4, vl);
                    src0 = __riscv_vget_v_u16m2x4_u16m2(src, 0);
                    src1 = __riscv_vget_v_u16m2x4_u16m2(src, 1);
                    src2 = __riscv_vget_v_u16m2x4_u16m2(src, 2);
                    src3 = __riscv_vget_v_u16m2x4_u16m2(src, 3);
                };
                // Row weights: {1,2,1} or {1,4,6,4,1}, applied via shifts.
                if (row0)
                {
                    loadres(row0);
                    sum0 = src0;
                    sum1 = src1;
                    sum2 = src2;
                    sum3 = src3;
                }
                if (row1)
                {
                    loadres(row1);
                    sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, ksize == 5 ? 2 : 1, vl), vl);
                    sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, ksize == 5 ? 2 : 1, vl), vl);
                    sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, ksize == 5 ? 2 : 1, vl), vl);
                    sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, ksize == 5 ? 2 : 1, vl), vl);
                }
                if (row2)
                {
                    loadres(row2);
                    if (ksize == 5)
                    {
                        // Weight 6 = <<1 + <<2.
                        src0 = __riscv_vadd(__riscv_vsll(src0, 1, vl), __riscv_vsll(src0, 2, vl), vl);
                        src1 = __riscv_vadd(__riscv_vsll(src1, 1, vl), __riscv_vsll(src1, 2, vl), vl);
                        src2 = __riscv_vadd(__riscv_vsll(src2, 1, vl), __riscv_vsll(src2, 2, vl), vl);
                        src3 = __riscv_vadd(__riscv_vsll(src3, 1, vl), __riscv_vsll(src3, 2, vl), vl);
                    }
                    sum0 = __riscv_vadd(sum0, src0, vl);
                    sum1 = __riscv_vadd(sum1, src1, vl);
                    sum2 = __riscv_vadd(sum2, src2, vl);
                    sum3 = __riscv_vadd(sum3, src3, vl);
                }
                if (row3)
                {
                    loadres(row3);
                    sum0 = __riscv_vadd(sum0, __riscv_vsll(src0, 2, vl), vl);
                    sum1 = __riscv_vadd(sum1, __riscv_vsll(src1, 2, vl), vl);
                    sum2 = __riscv_vadd(sum2, __riscv_vsll(src2, 2, vl), vl);
                    sum3 = __riscv_vadd(sum3, __riscv_vsll(src3, 2, vl), vl);
                }
                if (row4)
                {
                    loadres(row4);
                    sum0 = __riscv_vadd(sum0, src0, vl);
                    sum1 = __riscv_vadd(sum1, src1, vl);
                    sum2 = __riscv_vadd(sum2, src2, vl);
                    sum3 = __riscv_vadd(sum3, src3, vl);
                }
                // Normalize (shift 4 for /16, shift 8 for /256) with
                // round-to-nearest-up, saturate to u8, re-interleave, store.
                vuint8m1x4_t dst{};
                dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, __riscv_vnclipu(sum0, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl));
                dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, __riscv_vnclipu(sum1, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl));
                dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, __riscv_vnclipu(sum2, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl));
                dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, __riscv_vnclipu(sum3, ksize == 5 ? 8 : 4, __RISCV_VXRM_RNU, vl));
                __riscv_vsseg4e8(dst_data + cur * dst_step + j * 4, dst, vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// HAL entry point for the fixed-kernel binomial Gaussian blur (3x3 / 5x5).
// Supported layouts: CV_8UC1, CV_16UC1, CV_8UC4. In-place operation,
// BORDER_WRAP and BORDER_ISOLATED are rejected so OpenCV falls back to its
// generic implementation. Margins describe the ROI's position inside the
// full image and are forwarded so border interpolation sees real pixels.
int gaussianBlurBinomial(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, size_t margin_left, size_t margin_top, size_t margin_right, size_t margin_bottom, size_t ksize, int border_type)
{
    const int type = CV_MAKETYPE(depth, cn);
    const bool supported_type = type == CV_8UC1 || type == CV_8UC4 || type == CV_16UC1;
    if (!supported_type || src_data == dst_data)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    const bool supported_ksize = ksize == 3 || ksize == 5;
    if (!supported_ksize || (border_type & BORDER_ISOLATED) || border_type == BORDER_WRAP)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // Full-image geometry reconstructed from the ROI size and its margins.
    const int full_w = margin_left + width + margin_right;
    const int full_h = margin_top + height + margin_bottom;
    const int off_x = margin_left;
    const int off_y = margin_top;

    if (type == CV_8UC1)
        return ksize == 3
            ? common::invoke(height, {gaussianBlurC1<3, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type)
            : common::invoke(height, {gaussianBlurC1<5, RVV_U8M4, RVV_U16M8>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type);
    if (type == CV_16UC1)
        return ksize == 3
            ? common::invoke(height, {gaussianBlurC1<3, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type)
            : common::invoke(height, {gaussianBlurC1<5, RVV_U16M4, RVV_U32M8>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type);
    // type == CV_8UC4
    return ksize == 3
        ? common::invoke(height, {gaussianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type)
        : common::invoke(height, {gaussianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, full_w, full_h, off_x, off_y, border_type);
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,282 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
// Copyright (C) 2025, SpaceMIT Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
#include "rvv_hal.hpp"
#include <cstring>
#include <vector>
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Adapts a free function plus its trailing arguments to the ParallelLoopBody
// interface used by cv::parallel_for_. The wrapped callable is invoked once
// per parallel chunk with that chunk's [start, end) row range.
class HistogramInvoker : public ParallelLoopBody
{
public:
    // _func is called as _func(range.start, range.end, args...); the extra
    // arguments are forwarded into the bound callable at construction time.
    template<typename... Args>
    HistogramInvoker(std::function<void(int, int, Args...)> _func, Args&&... args)
    {
        func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward<Args>(args)...);
    }
    virtual void operator()(const Range& range) const override
    {
        func(range.start, range.end);
    }
private:
    // Row-range callable invoked for each parallel chunk.
    std::function<void(int, int)> func;
};
constexpr int HIST_SZ = std::numeric_limits<uchar>::max() + 1;
static inline void hist_invoke(int start, int end, const uchar* src_data, size_t src_step, int width, int* hist, std::mutex* m)
{
int h[HIST_SZ] = {0};
for (int i = start; i < end; i++)
{
const uchar* src = src_data + i * src_step;
int j;
for (j = 0; j + 3 < width; j += 4)
{
int t0 = src[j], t1 = src[j+1];
h[t0]++; h[t1]++;
t0 = src[j+2]; t1 = src[j+3];
h[t0]++; h[t1]++;
}
for (; j < width; j++)
{
h[src[j]]++;
}
}
std::lock_guard<std::mutex> lk(*m);
for (int i = 0; i < HIST_SZ; i++)
{
hist[i] += h[i];
}
}
// Applies the 256-entry lookup table to rows [start, end) with RVV indexed
// loads: every source byte is used directly as a byte offset into `lut`.
static inline void lut_invoke(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, const uchar* lut)
{
    for (int i = start; i < end; i++)
    {
        int vl;
        for (int j = 0; j < width; j += vl)
        {
            vl = __riscv_vsetvl_e8m8(width - j);
            auto src = __riscv_vle8_v_u8m8(src_data + i * src_step + j, vl);
            // Gather: dst[k] = lut[src[k]] (u8 values double as byte offsets).
            auto dst = __riscv_vloxei8_v_u8m8(lut, src, vl);
            __riscv_vse8(dst_data + i * dst_step + j, dst, vl);
        }
    }
}
} // anonymous
// the algorithm is copied from imgproc/src/histogram.cpp,
// in the function void cv::equalizeHist
// Histogram equalization for 8-bit single-channel images: builds the 256-bin
// histogram in parallel, turns its cumulative sum into a lookup table scaled
// to [0, 255], then applies the LUT in parallel.
int equalize_hist(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height)
{
    int hist[HIST_SZ] = {0};
    uchar lut[HIST_SZ];
    std::mutex m;
    // Granularity: ~32K pixels per chunk; multiply in double to avoid int
    // overflow for very large images.
    cv::parallel_for_(Range(0, height), HistogramInvoker({hist_invoke}, src_data, src_step, width, reinterpret_cast<int *>(hist), &m), static_cast<double>(width) * height / (1 << 15));
    // Find the first occupied bin; its pixels map to 0.
    int i = 0;
    while (!hist[i]) ++i;
    // Constant image: every pixel falls in one bin, so the scale denominator
    // below would be zero. Mirror cv::equalizeHist and return the image as-is
    // (dst.setTo(i) upstream).
    if (hist[i] == width * height)
    {
        for (int y = 0; y < height; y++)
            std::memset(dst_data + y * dst_step, i, width);
        return CV_HAL_ERROR_OK;
    }
    float scale = (HIST_SZ - 1.f)/(width * height - hist[i]);
    // Cumulative histogram -> LUT, rounded and clamped to [0, 255].
    int sum = 0;
    for (lut[i++] = 0; i < HIST_SZ; i++)
    {
        sum += hist[i];
        lut[i] = std::min(std::max(static_cast<int>(std::round(sum * scale)), 0), HIST_SZ - 1);
    }
    cv::parallel_for_(Range(0, height), HistogramInvoker({lut_invoke}, src_data, src_step, dst_data, dst_step, width, reinterpret_cast<const uchar*>(lut)), static_cast<double>(width) * height / (1 << 15));
    return CV_HAL_ERROR_OK;
}
// ############ calc_hist ############
namespace {
// Widest VLEN this code sizes its scratch buffers for; calc_hist's buf_idx
// depends on it (assumes VLEN <= 1024 — TODO confirm on wider hardware).
constexpr int MAX_VLEN = 1024;
constexpr int MAX_E8M1 = MAX_VLEN / 8;
// fhist[i] = (float)ihist[i] for i in [0, hist_size) — overwrite mode.
inline void cvt_32s32f(const int* ihist, float* fhist, int hist_size) {
    int vl;
    for (int i = 0; i < hist_size; i += vl) {
        vl = __riscv_vsetvl_e32m8(hist_size - i);
        auto iv = __riscv_vle32_v_i32m8(ihist + i, vl);
        __riscv_vse32(fhist + i, __riscv_vfcvt_f(iv, vl), vl);
    }
}
// fhist[i] += (float)ihist[i] for i in [0, hist_size) — accumulate mode.
inline void cvt32s32f_add32f(const int* ihist, float* fhist, int hist_size) {
    int vl;
    for (int i = 0; i < hist_size; i += vl) {
        vl = __riscv_vsetvl_e32m8(hist_size - i);
        auto iv = __riscv_vle32_v_i32m8(ihist + i, vl);
        auto fv = __riscv_vle32_v_f32m8(fhist + i, vl);
        auto s = __riscv_vfadd(__riscv_vfcvt_f(iv, vl), fv, vl);
        __riscv_vse32(fhist + i, s, vl);
    }
}
}
// Vectorized 1-D histogram over channel 0 of a CV_16U or CV_32F image with a
// single uniform range [low, high). For each lane the bin is computed as
// floor(v * a + b), clamped to [0, sz-1]; out-of-range values are redirected
// to a sacrificial bin at index 0 of `ihist` (the real bins start at ihist+1).
// Counts are accumulated as int and converted to float at the end.
int calc_hist(const uchar* src_data, size_t src_step, int src_type, int src_width, int src_height,
              float* hist_data, int hist_size, const float** ranges, bool uniform, bool accumulate) {
    int depth = CV_MAT_DEPTH(src_type), cn = CV_MAT_CN(src_type);
    // [TODO] support non-uniform
    // In case of CV_8U, it is already fast enough with lut
    if ((depth != CV_16U && depth != CV_32F) || !uniform) {
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    }
    // One extra leading bin (index 0) absorbs out-of-range samples.
    std::vector<int> buf_ihist(hist_size+1, 0);
    int* ihist = buf_ihist.data();
    // Uniform binning: bin = v * a + b maps [low, high) onto [0, hist_size).
    double low = ranges[0][0], high = ranges[0][1];
    double t = hist_size / (high - low);
    double a = t, b = -t * low;
    double v0_lo = low, v0_hi = high;
    // step0 is the row stride in elements; d0 strides over channels so only
    // channel 0 is counted.
    int sz = hist_size, d0 = cn, step0 = (int)(src_step / CV_ELEM_SIZE1(src_type));
    // Scratch for per-lane bin indices. vl <= LMUL*VLEN/SEW = VLEN/8 for both
    // e16m2 and e32m4 below, so MAX_E8M1 entries suffice while VLEN <= MAX_VLEN.
    int buf_idx[MAX_E8M1];
    if (depth == CV_16U) {
        const ushort* p0 = (const ushort*)src_data;
        if (d0 == 1) {
            // Contiguous single-channel u16 rows.
            while (src_height--) {
                int vl;
                for (int x = 0; x < src_width; x += vl) {
                    vl = __riscv_vsetvl_e16m2(src_width - x);
                    // Widen u16 -> u64 -> f64 for exact index arithmetic.
                    auto v = __riscv_vfcvt_f(__riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle16_v_u16m2(p0 + x, vl), vl), vl), vl);
                    // Mask of lanes outside [low, high).
                    auto m0 = __riscv_vmflt(v, v0_lo, vl);
                    auto m1 = __riscv_vmfge(v, v0_hi, vl);
                    auto m = __riscv_vmor(m0, m1, vl);
                    auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl);
                    // Subtracting just under 0.5 before the round-to-nearest
                    // narrowing convert emulates floor() — assumes the default
                    // RNE rounding mode; TODO confirm.
                    auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl);
                    idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl);
                    idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl);
                    // Out-of-range lanes land in ihist[0] after the +1 below.
                    idx = __riscv_vmerge(idx, -1, m, vl);
                    __riscv_vse32(buf_idx, idx, vl);
                    for (int i = 0; i < vl; i++) {
                        int _idx = buf_idx[i] + 1;
                        ihist[_idx]++;
                    }
                }
                p0 += step0;
            }
        } else {
            // Multi-channel u16: strided load picks out channel 0.
            while (src_height--) {
                int vl;
                for (int x = 0; x < src_width; x += vl) {
                    vl = __riscv_vsetvl_e16m2(src_width - x);
                    auto v = __riscv_vfcvt_f(__riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vlse16_v_u16m2(p0 + x*d0, sizeof(ushort)*d0, vl), vl), vl), vl);
                    auto m0 = __riscv_vmflt(v, v0_lo, vl);
                    auto m1 = __riscv_vmfge(v, v0_hi, vl);
                    auto m = __riscv_vmor(m0, m1, vl);
                    auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl);
                    auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl);
                    idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl);
                    idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl);
                    idx = __riscv_vmerge(idx, -1, m, vl);
                    __riscv_vse32(buf_idx, idx, vl);
                    for (int i = 0; i < vl; i++) {
                        int _idx = buf_idx[i] + 1;
                        ihist[_idx]++;
                    }
                }
                p0 += step0;
            }
        }
    } else if (depth == CV_32F) {
        const float* p0 = (const float*)src_data;
        if (d0 == 1) {
            // Contiguous single-channel f32 rows (widened to f64).
            while (src_height--) {
                int vl;
                for (int x = 0; x < src_width; x += vl) {
                    vl = __riscv_vsetvl_e32m4(src_width - x);
                    auto v = __riscv_vfwcvt_f(__riscv_vle32_v_f32m4(p0 + x, vl), vl);
                    auto m0 = __riscv_vmflt(v, v0_lo, vl);
                    auto m1 = __riscv_vmfge(v, v0_hi, vl);
                    auto m = __riscv_vmor(m0, m1, vl);
                    auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl);
                    auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl);
                    idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl);
                    idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl);
                    idx = __riscv_vmerge(idx, -1, m, vl);
                    __riscv_vse32(buf_idx, idx, vl);
                    for (int i = 0; i < vl; i++) {
                        int _idx = buf_idx[i] + 1;
                        ihist[_idx]++;
                    }
                }
                p0 += step0;
            }
        } else {
            // Multi-channel f32: strided load picks out channel 0.
            while (src_height--) {
                int vl;
                for (int x = 0; x < src_width; x += vl) {
                    vl = __riscv_vsetvl_e32m4(src_width - x);
                    auto v = __riscv_vfwcvt_f(__riscv_vlse32_v_f32m4(p0 + x*d0, sizeof(float)*d0, vl), vl);
                    auto m0 = __riscv_vmflt(v, v0_lo, vl);
                    auto m1 = __riscv_vmfge(v, v0_hi, vl);
                    auto m = __riscv_vmor(m0, m1, vl);
                    auto fidx = __riscv_vfadd(__riscv_vfmul(v, a, vl), b, vl);
                    auto idx = __riscv_vfncvt_x(__riscv_vfsub(fidx, 0.5f - 1e-6, vl), vl);
                    idx = __riscv_vmerge(idx, 0, __riscv_vmslt(idx, 0, vl), vl);
                    idx = __riscv_vmerge(idx, sz-1, __riscv_vmsgt(idx, sz-1, vl), vl);
                    idx = __riscv_vmerge(idx, -1, m, vl);
                    __riscv_vse32(buf_idx, idx, vl);
                    for (int i = 0; i < vl; i++) {
                        int _idx = buf_idx[i] + 1;
                        ihist[_idx]++;
                    }
                }
                p0 += step0;
            }
        }
    }
    // Export counts to float, skipping the out-of-range bin at ihist[0].
    if (accumulate) {
        cvt32s32f_add32f(ihist+1, hist_data, hist_size);
    } else {
        std::memset(hist_data, 0, sizeof(float)*hist_size);
        cvt_32s32f(ihist+1, hist_data, hist_size);
    }
    return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -4,16 +4,13 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED
#define OPENCV_HAL_RVV_INTEGRAL_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include "types.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
namespace cv { namespace cv_hal_rvv {
#if CV_HAL_RVV_1P0_ENABLED
#undef cv_hal_integral
#define cv_hal_integral cv::cv_hal_rvv::integral
namespace {
template <typename vec_t>
inline typename vec_t::VecType repeat_last_n(typename vec_t::VecType vs, int n, size_t vl) {
@ -87,6 +84,8 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz
return result;
}
} // anonymous
/**
@brief Calculate integral image
@param depth Depth of source image
@ -119,7 +118,7 @@ inline int integral(const uchar* src_data, size_t src_step, uchar* sum_data, siz
CV_32F | CV_64F | CV_64F
CV_64F | CV_64F | CV_64F
*/
inline int integral(int depth, int sdepth, int sqdepth,
int integral(int depth, int sdepth, int sqdepth,
const uchar* src_data, size_t src_step,
uchar* sum_data, size_t sum_step,
uchar* sqsum_data, size_t sqsum_step,
@ -168,6 +167,6 @@ inline int integral(int depth, int sdepth, int sqdepth,
return result;
}
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,575 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// the algorithm is copied from imgproc/src/median_blur.simd.cpp
// in the function template static void medianBlur_SortNet
// Median blur for a single-channel image, rows [start, end), using the
// branch-free median sorting networks from medianBlur_SortNet (9 elements
// for ksize 3, 25 for ksize 5). Row indices are clamped to [0, height-1]
// (replicate border); columns at the edges are replicated via vslide1up
// with the first element. The exchange order of the vop() calls IS the
// sorting network — do not reorder.
template<int ksize, typename helper>
static inline int medianBlurC1(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height)
{
    using T = typename helper::ElemType;
    using VT = typename helper::VecType;
    for (int i = start; i < end; i++)
    {
        // Five clamped row pointers; for ksize==3 only row0..row2 are used
        // (row3/row4 are still computed but never dereferenced).
        const T* row0 = reinterpret_cast<const T*>(src_data + std::min(std::max(i     - ksize / 2, 0), height - 1) * src_step);
        const T* row1 = reinterpret_cast<const T*>(src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step);
        const T* row2 = reinterpret_cast<const T*>(src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step);
        const T* row3 = reinterpret_cast<const T*>(src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step);
        const T* row4 = reinterpret_cast<const T*>(src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step);
        int vl;
        // Compare-exchange: after the call a = min, b = max (one comparator
        // of the sorting network, applied lane-wise).
        auto vop = [&vl](VT& a, VT& b) {
            auto t = a;
            a = helper::vmin(a, b, vl);
            b = helper::vmax(t, b, vl);
        };
        for (int j = 0; j < width; j += vl)
        {
            vl = helper::setvl(width - j);
            if (ksize == 3)
            {
                // p0..p8: the 3x3 neighborhood, one vector per kernel cell.
                VT p0, p1, p2;
                VT p3, p4, p5;
                VT p6, p7, p8;
                if (j != 0)
                {
                    p0 = helper::vload(row0 + j - 1, vl);
                    p3 = helper::vload(row1 + j - 1, vl);
                    p6 = helper::vload(row2 + j - 1, vl);
                }
                else
                {
                    // Left border: replicate the first column.
                    p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl);
                    p3 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl);
                    p6 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl);
                }
                // Middle and right columns by sliding; right border clamps
                // the injected scalar to width-1.
                p1 = helper::vslide1down(p0, row0[j + vl - 1], vl);
                p4 = helper::vslide1down(p3, row1[j + vl - 1], vl);
                p7 = helper::vslide1down(p6, row2[j + vl - 1], vl);
                p2 = helper::vslide1down(p1, row0[std::min(width - 1, j + vl)], vl);
                p5 = helper::vslide1down(p4, row1[std::min(width - 1, j + vl)], vl);
                p8 = helper::vslide1down(p7, row2[std::min(width - 1, j + vl)], vl);
                // 19-comparator median-of-9 network; result in p4.
                vop(p1, p2); vop(p4, p5); vop(p7, p8); vop(p0, p1);
                vop(p3, p4); vop(p6, p7); vop(p1, p2); vop(p4, p5);
                vop(p7, p8); vop(p0, p3); vop(p5, p8); vop(p4, p7);
                vop(p3, p6); vop(p1, p4); vop(p2, p5); vop(p4, p7);
                vop(p4, p2); vop(p6, p4); vop(p4, p2);
                helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, p4, vl);
            }
            else
            {
                // p0..p24: the 5x5 neighborhood, one vector per kernel cell.
                VT p0, p1, p2, p3, p4;
                VT p5, p6, p7, p8, p9;
                VT p10, p11, p12, p13, p14;
                VT p15, p16, p17, p18, p19;
                VT p20, p21, p22, p23, p24;
                if (j >= 2)
                {
                    p0 = helper::vload(row0 + j - 2, vl);
                    p5 = helper::vload(row1 + j - 2, vl);
                    p10 = helper::vload(row2 + j - 2, vl);
                    p15 = helper::vload(row3 + j - 2, vl);
                    p20 = helper::vload(row4 + j - 2, vl);
                }
                else
                {
                    // Left border: replicate the first column once (j==1)
                    // or twice (j==0).
                    p0 = helper::vslide1up(helper::vload(row0, vl), row0[0], vl);
                    p5 = helper::vslide1up(helper::vload(row1, vl), row1[0], vl);
                    p10 = helper::vslide1up(helper::vload(row2, vl), row2[0], vl);
                    p15 = helper::vslide1up(helper::vload(row3, vl), row3[0], vl);
                    p20 = helper::vslide1up(helper::vload(row4, vl), row4[0], vl);
                    if (j == 0)
                    {
                        p0 = helper::vslide1up(p0, row0[0], vl);
                        p5 = helper::vslide1up(p5, row1[0], vl);
                        p10 = helper::vslide1up(p10, row2[0], vl);
                        p15 = helper::vslide1up(p15, row3[0], vl);
                        p20 = helper::vslide1up(p20, row4[0], vl);
                    }
                }
                // Remaining four columns by successive slides; right border
                // clamps the injected scalar to width-1.
                p1 = helper::vslide1down(p0, row0[j + vl - 2], vl);
                p6 = helper::vslide1down(p5, row1[j + vl - 2], vl);
                p11 = helper::vslide1down(p10, row2[j + vl - 2], vl);
                p16 = helper::vslide1down(p15, row3[j + vl - 2], vl);
                p21 = helper::vslide1down(p20, row4[j + vl - 2], vl);
                p2 = helper::vslide1down(p1, row0[j + vl - 1], vl);
                p7 = helper::vslide1down(p6, row1[j + vl - 1], vl);
                p12 = helper::vslide1down(p11, row2[j + vl - 1], vl);
                p17 = helper::vslide1down(p16, row3[j + vl - 1], vl);
                p22 = helper::vslide1down(p21, row4[j + vl - 1], vl);
                p3 = helper::vslide1down(p2, row0[std::min(width - 1, j + vl)], vl);
                p8 = helper::vslide1down(p7, row1[std::min(width - 1, j + vl)], vl);
                p13 = helper::vslide1down(p12, row2[std::min(width - 1, j + vl)], vl);
                p18 = helper::vslide1down(p17, row3[std::min(width - 1, j + vl)], vl);
                p23 = helper::vslide1down(p22, row4[std::min(width - 1, j + vl)], vl);
                p4 = helper::vslide1down(p3, row0[std::min(width - 1, j + vl + 1)], vl);
                p9 = helper::vslide1down(p8, row1[std::min(width - 1, j + vl + 1)], vl);
                p14 = helper::vslide1down(p13, row2[std::min(width - 1, j + vl + 1)], vl);
                p19 = helper::vslide1down(p18, row3[std::min(width - 1, j + vl + 1)], vl);
                p24 = helper::vslide1down(p23, row4[std::min(width - 1, j + vl + 1)], vl);
                // Median-of-25 sorting network; result in p12.
                vop(p1, p2); vop(p0, p1); vop(p1, p2); vop(p4, p5); vop(p3, p4);
                vop(p4, p5); vop(p0, p3); vop(p2, p5); vop(p2, p3); vop(p1, p4);
                vop(p1, p2); vop(p3, p4); vop(p7, p8); vop(p6, p7); vop(p7, p8);
                vop(p10, p11); vop(p9, p10); vop(p10, p11); vop(p6, p9); vop(p8, p11);
                vop(p8, p9); vop(p7, p10); vop(p7, p8); vop(p9, p10); vop(p0, p6);
                vop(p4, p10); vop(p4, p6); vop(p2, p8); vop(p2, p4); vop(p6, p8);
                vop(p1, p7); vop(p5, p11); vop(p5, p7); vop(p3, p9); vop(p3, p5);
                vop(p7, p9); vop(p1, p2); vop(p3, p4); vop(p5, p6); vop(p7, p8);
                vop(p9, p10); vop(p13, p14); vop(p12, p13); vop(p13, p14); vop(p16, p17);
                vop(p15, p16); vop(p16, p17); vop(p12, p15); vop(p14, p17); vop(p14, p15);
                vop(p13, p16); vop(p13, p14); vop(p15, p16); vop(p19, p20); vop(p18, p19);
                vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p21, p23); vop(p22, p24);
                vop(p22, p23); vop(p18, p21); vop(p20, p23); vop(p20, p21); vop(p19, p22);
                vop(p22, p24); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p12, p18);
                vop(p16, p22); vop(p16, p18); vop(p14, p20); vop(p20, p24); vop(p14, p16);
                vop(p18, p20); vop(p22, p24); vop(p13, p19); vop(p17, p23); vop(p17, p19);
                vop(p15, p21); vop(p15, p17); vop(p19, p21); vop(p13, p14); vop(p15, p16);
                vop(p17, p18); vop(p19, p20); vop(p21, p22); vop(p23, p24); vop(p0, p12);
                vop(p8, p20); vop(p8, p12); vop(p4, p16); vop(p16, p24); vop(p12, p16);
                vop(p2, p14); vop(p10, p22); vop(p10, p14); vop(p6, p18); vop(p6, p10);
                vop(p10, p12); vop(p1, p13); vop(p9, p21); vop(p9, p13); vop(p5, p17);
                vop(p13, p17); vop(p3, p15); vop(p11, p23); vop(p11, p15); vop(p7, p19);
                vop(p7, p11); vop(p11, p13); vop(p11, p12);
                helper::vstore(reinterpret_cast<T*>(dst_data + i * dst_step) + j, p12, vl);
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
template<int ksize>
static inline int medianBlurC4(int start, int end, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height)
{
for (int i = start; i < end; i++)
{
const uchar* row0 = src_data + std::min(std::max(i - ksize / 2, 0), height - 1) * src_step;
const uchar* row1 = src_data + std::min(std::max(i + 1 - ksize / 2, 0), height - 1) * src_step;
const uchar* row2 = src_data + std::min(std::max(i + 2 - ksize / 2, 0), height - 1) * src_step;
const uchar* row3 = src_data + std::min(std::max(i + 3 - ksize / 2, 0), height - 1) * src_step;
const uchar* row4 = src_data + std::min(std::max(i + 4 - ksize / 2, 0), height - 1) * src_step;
int vl;
for (int j = 0; j < width; j += vl)
{
if (ksize == 3)
{
vl = __riscv_vsetvl_e8m1(width - j);
vuint8m1_t p00, p01, p02;
vuint8m1_t p03, p04, p05;
vuint8m1_t p06, p07, p08;
vuint8m1_t p10, p11, p12;
vuint8m1_t p13, p14, p15;
vuint8m1_t p16, p17, p18;
vuint8m1_t p20, p21, p22;
vuint8m1_t p23, p24, p25;
vuint8m1_t p26, p27, p28;
vuint8m1_t p30, p31, p32;
vuint8m1_t p33, p34, p35;
vuint8m1_t p36, p37, p38;
auto loadsrc = [&vl](const uchar* row, vuint8m1_t& p0, vuint8m1_t& p1, vuint8m1_t& p2, vuint8m1_t& p3) {
auto src = __riscv_vlseg4e8_v_u8m1x4(row, vl);
p0 = __riscv_vget_v_u8m1x4_u8m1(src, 0);
p1 = __riscv_vget_v_u8m1x4_u8m1(src, 1);
p2 = __riscv_vget_v_u8m1x4_u8m1(src, 2);
p3 = __riscv_vget_v_u8m1x4_u8m1(src, 3);
};
if (j != 0)
{
loadsrc(row0 + (j - 1) * 4, p00, p10, p20, p30);
loadsrc(row1 + (j - 1) * 4, p03, p13, p23, p33);
loadsrc(row2 + (j - 1) * 4, p06, p16, p26, p36);
}
else
{
loadsrc(row0, p00, p10, p20, p30);
loadsrc(row1, p03, p13, p23, p33);
loadsrc(row2, p06, p16, p26, p36);
p00 = __riscv_vslide1up(p00, row0[0], vl);
p10 = __riscv_vslide1up(p10, row0[1], vl);
p20 = __riscv_vslide1up(p20, row0[2], vl);
p30 = __riscv_vslide1up(p30, row0[3], vl);
p03 = __riscv_vslide1up(p03, row1[0], vl);
p13 = __riscv_vslide1up(p13, row1[1], vl);
p23 = __riscv_vslide1up(p23, row1[2], vl);
p33 = __riscv_vslide1up(p33, row1[3], vl);
p06 = __riscv_vslide1up(p06, row2[0], vl);
p16 = __riscv_vslide1up(p16, row2[1], vl);
p26 = __riscv_vslide1up(p26, row2[2], vl);
p36 = __riscv_vslide1up(p36, row2[3], vl);
}
p01 = __riscv_vslide1down(p00, row0[(j + vl - 1) * 4 ], vl);
p11 = __riscv_vslide1down(p10, row0[(j + vl - 1) * 4 + 1], vl);
p21 = __riscv_vslide1down(p20, row0[(j + vl - 1) * 4 + 2], vl);
p31 = __riscv_vslide1down(p30, row0[(j + vl - 1) * 4 + 3], vl);
p04 = __riscv_vslide1down(p03, row1[(j + vl - 1) * 4 ], vl);
p14 = __riscv_vslide1down(p13, row1[(j + vl - 1) * 4 + 1], vl);
p24 = __riscv_vslide1down(p23, row1[(j + vl - 1) * 4 + 2], vl);
p34 = __riscv_vslide1down(p33, row1[(j + vl - 1) * 4 + 3], vl);
p07 = __riscv_vslide1down(p06, row2[(j + vl - 1) * 4 ], vl);
p17 = __riscv_vslide1down(p16, row2[(j + vl - 1) * 4 + 1], vl);
p27 = __riscv_vslide1down(p26, row2[(j + vl - 1) * 4 + 2], vl);
p37 = __riscv_vslide1down(p36, row2[(j + vl - 1) * 4 + 3], vl);
p02 = __riscv_vslide1down(p01, row0[std::min(width - 1, j + vl) * 4 ], vl);
p12 = __riscv_vslide1down(p11, row0[std::min(width - 1, j + vl) * 4 + 1], vl);
p22 = __riscv_vslide1down(p21, row0[std::min(width - 1, j + vl) * 4 + 2], vl);
p32 = __riscv_vslide1down(p31, row0[std::min(width - 1, j + vl) * 4 + 3], vl);
p05 = __riscv_vslide1down(p04, row1[std::min(width - 1, j + vl) * 4 ], vl);
p15 = __riscv_vslide1down(p14, row1[std::min(width - 1, j + vl) * 4 + 1], vl);
p25 = __riscv_vslide1down(p24, row1[std::min(width - 1, j + vl) * 4 + 2], vl);
p35 = __riscv_vslide1down(p34, row1[std::min(width - 1, j + vl) * 4 + 3], vl);
p08 = __riscv_vslide1down(p07, row2[std::min(width - 1, j + vl) * 4 ], vl);
p18 = __riscv_vslide1down(p17, row2[std::min(width - 1, j + vl) * 4 + 1], vl);
p28 = __riscv_vslide1down(p27, row2[std::min(width - 1, j + vl) * 4 + 2], vl);
p38 = __riscv_vslide1down(p37, row2[std::min(width - 1, j + vl) * 4 + 3], vl);
auto vop = [&vl](vuint8m1_t& a, vuint8m1_t& b) {
auto t = a;
a = __riscv_vminu(a, b, vl);
b = __riscv_vmaxu(t, b, vl);
};
vuint8m1x4_t dst{};
vop(p01, p02); vop(p04, p05); vop(p07, p08); vop(p00, p01);
vop(p03, p04); vop(p06, p07); vop(p01, p02); vop(p04, p05);
vop(p07, p08); vop(p00, p03); vop(p05, p08); vop(p04, p07);
vop(p03, p06); vop(p01, p04); vop(p02, p05); vop(p04, p07);
vop(p04, p02); vop(p06, p04); vop(p04, p02);
dst = __riscv_vset_v_u8m1_u8m1x4(dst, 0, p04);
vop(p11, p12); vop(p14, p15); vop(p17, p18); vop(p10, p11);
vop(p13, p14); vop(p16, p17); vop(p11, p12); vop(p14, p15);
vop(p17, p18); vop(p10, p13); vop(p15, p18); vop(p14, p17);
vop(p13, p16); vop(p11, p14); vop(p12, p15); vop(p14, p17);
vop(p14, p12); vop(p16, p14); vop(p14, p12);
dst = __riscv_vset_v_u8m1_u8m1x4(dst, 1, p14);
vop(p21, p22); vop(p24, p25); vop(p27, p28); vop(p20, p21);
vop(p23, p24); vop(p26, p27); vop(p21, p22); vop(p24, p25);
vop(p27, p28); vop(p20, p23); vop(p25, p28); vop(p24, p27);
vop(p23, p26); vop(p21, p24); vop(p22, p25); vop(p24, p27);
vop(p24, p22); vop(p26, p24); vop(p24, p22);
dst = __riscv_vset_v_u8m1_u8m1x4(dst, 2, p24);
vop(p31, p32); vop(p34, p35); vop(p37, p38); vop(p30, p31);
vop(p33, p34); vop(p36, p37); vop(p31, p32); vop(p34, p35);
vop(p37, p38); vop(p30, p33); vop(p35, p38); vop(p34, p37);
vop(p33, p36); vop(p31, p34); vop(p32, p35); vop(p34, p37);
vop(p34, p32); vop(p36, p34); vop(p34, p32);
dst = __riscv_vset_v_u8m1_u8m1x4(dst, 3, p34);
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
else
{
vl = __riscv_vsetvl_e8m2(width - j);
vuint8m2_t p00, p01, p02, p03, p04;
vuint8m2_t p05, p06, p07, p08, p09;
vuint8m2_t p010, p011, p012, p013, p014;
vuint8m2_t p015, p016, p017, p018, p019;
vuint8m2_t p020, p021, p022, p023, p024;
vuint8m2_t p10, p11, p12, p13, p14;
vuint8m2_t p15, p16, p17, p18, p19;
vuint8m2_t p110, p111, p112, p113, p114;
vuint8m2_t p115, p116, p117, p118, p119;
vuint8m2_t p120, p121, p122, p123, p124;
vuint8m2_t p20, p21, p22, p23, p24;
vuint8m2_t p25, p26, p27, p28, p29;
vuint8m2_t p210, p211, p212, p213, p214;
vuint8m2_t p215, p216, p217, p218, p219;
vuint8m2_t p220, p221, p222, p223, p224;
vuint8m2_t p30, p31, p32, p33, p34;
vuint8m2_t p35, p36, p37, p38, p39;
vuint8m2_t p310, p311, p312, p313, p314;
vuint8m2_t p315, p316, p317, p318, p319;
vuint8m2_t p320, p321, p322, p323, p324;
auto loadsrc = [&vl](const uchar* row, vuint8m2_t& p0, vuint8m2_t& p1, vuint8m2_t& p2, vuint8m2_t& p3) {
auto src = __riscv_vlseg4e8_v_u8m2x4(row, vl);
p0 = __riscv_vget_v_u8m2x4_u8m2(src, 0);
p1 = __riscv_vget_v_u8m2x4_u8m2(src, 1);
p2 = __riscv_vget_v_u8m2x4_u8m2(src, 2);
p3 = __riscv_vget_v_u8m2x4_u8m2(src, 3);
};
if (j >= 2)
{
loadsrc(row0 + (j - 2) * 4, p00, p10, p20, p30);
loadsrc(row1 + (j - 2) * 4, p05, p15, p25, p35);
loadsrc(row2 + (j - 2) * 4, p010, p110, p210, p310);
loadsrc(row3 + (j - 2) * 4, p015, p115, p215, p315);
loadsrc(row4 + (j - 2) * 4, p020, p120, p220, p320);
}
else
{
loadsrc(row0, p00, p10, p20, p30);
loadsrc(row1, p05, p15, p25, p35);
loadsrc(row2, p010, p110, p210, p310);
loadsrc(row3, p015, p115, p215, p315);
loadsrc(row4, p020, p120, p220, p320);
auto slideup = [&] {
p00 = __riscv_vslide1up(p00, row0[0], vl);
p10 = __riscv_vslide1up(p10, row0[1], vl);
p20 = __riscv_vslide1up(p20, row0[2], vl);
p30 = __riscv_vslide1up(p30, row0[3], vl);
p05 = __riscv_vslide1up(p05, row1[0], vl);
p15 = __riscv_vslide1up(p15, row1[1], vl);
p25 = __riscv_vslide1up(p25, row1[2], vl);
p35 = __riscv_vslide1up(p35, row1[3], vl);
p010 = __riscv_vslide1up(p010, row2[0], vl);
p110 = __riscv_vslide1up(p110, row2[1], vl);
p210 = __riscv_vslide1up(p210, row2[2], vl);
p310 = __riscv_vslide1up(p310, row2[3], vl);
p015 = __riscv_vslide1up(p015, row3[0], vl);
p115 = __riscv_vslide1up(p115, row3[1], vl);
p215 = __riscv_vslide1up(p215, row3[2], vl);
p315 = __riscv_vslide1up(p315, row3[3], vl);
p020 = __riscv_vslide1up(p020, row4[0], vl);
p120 = __riscv_vslide1up(p120, row4[1], vl);
p220 = __riscv_vslide1up(p220, row4[2], vl);
p320 = __riscv_vslide1up(p320, row4[3], vl);
};
slideup();
if (j == 0)
{
slideup();
}
}
p01 = __riscv_vslide1down(p00, row0[(j + vl - 2) * 4 ], vl);
p11 = __riscv_vslide1down(p10, row0[(j + vl - 2) * 4 + 1], vl);
p21 = __riscv_vslide1down(p20, row0[(j + vl - 2) * 4 + 2], vl);
p31 = __riscv_vslide1down(p30, row0[(j + vl - 2) * 4 + 3], vl);
p06 = __riscv_vslide1down(p05, row1[(j + vl - 2) * 4 ], vl);
p16 = __riscv_vslide1down(p15, row1[(j + vl - 2) * 4 + 1], vl);
p26 = __riscv_vslide1down(p25, row1[(j + vl - 2) * 4 + 2], vl);
p36 = __riscv_vslide1down(p35, row1[(j + vl - 2) * 4 + 3], vl);
p011 = __riscv_vslide1down(p010, row2[(j + vl - 2) * 4 ], vl);
p111 = __riscv_vslide1down(p110, row2[(j + vl - 2) * 4 + 1], vl);
p211 = __riscv_vslide1down(p210, row2[(j + vl - 2) * 4 + 2], vl);
p311 = __riscv_vslide1down(p310, row2[(j + vl - 2) * 4 + 3], vl);
p016 = __riscv_vslide1down(p015, row3[(j + vl - 2) * 4 ], vl);
p116 = __riscv_vslide1down(p115, row3[(j + vl - 2) * 4 + 1], vl);
p216 = __riscv_vslide1down(p215, row3[(j + vl - 2) * 4 + 2], vl);
p316 = __riscv_vslide1down(p315, row3[(j + vl - 2) * 4 + 3], vl);
p021 = __riscv_vslide1down(p020, row4[(j + vl - 2) * 4 ], vl);
p121 = __riscv_vslide1down(p120, row4[(j + vl - 2) * 4 + 1], vl);
p221 = __riscv_vslide1down(p220, row4[(j + vl - 2) * 4 + 2], vl);
p321 = __riscv_vslide1down(p320, row4[(j + vl - 2) * 4 + 3], vl);
p02 = __riscv_vslide1down(p01, row0[(j + vl - 1) * 4 ], vl);
p12 = __riscv_vslide1down(p11, row0[(j + vl - 1) * 4 + 1], vl);
p22 = __riscv_vslide1down(p21, row0[(j + vl - 1) * 4 + 2], vl);
p32 = __riscv_vslide1down(p31, row0[(j + vl - 1) * 4 + 3], vl);
p07 = __riscv_vslide1down(p06, row1[(j + vl - 1) * 4 ], vl);
p17 = __riscv_vslide1down(p16, row1[(j + vl - 1) * 4 + 1], vl);
p27 = __riscv_vslide1down(p26, row1[(j + vl - 1) * 4 + 2], vl);
p37 = __riscv_vslide1down(p36, row1[(j + vl - 1) * 4 + 3], vl);
p012 = __riscv_vslide1down(p011, row2[(j + vl - 1) * 4 ], vl);
p112 = __riscv_vslide1down(p111, row2[(j + vl - 1) * 4 + 1], vl);
p212 = __riscv_vslide1down(p211, row2[(j + vl - 1) * 4 + 2], vl);
p312 = __riscv_vslide1down(p311, row2[(j + vl - 1) * 4 + 3], vl);
p017 = __riscv_vslide1down(p016, row3[(j + vl - 1) * 4 ], vl);
p117 = __riscv_vslide1down(p116, row3[(j + vl - 1) * 4 + 1], vl);
p217 = __riscv_vslide1down(p216, row3[(j + vl - 1) * 4 + 2], vl);
p317 = __riscv_vslide1down(p316, row3[(j + vl - 1) * 4 + 3], vl);
p022 = __riscv_vslide1down(p021, row4[(j + vl - 1) * 4 ], vl);
p122 = __riscv_vslide1down(p121, row4[(j + vl - 1) * 4 + 1], vl);
p222 = __riscv_vslide1down(p221, row4[(j + vl - 1) * 4 + 2], vl);
p322 = __riscv_vslide1down(p321, row4[(j + vl - 1) * 4 + 3], vl);
p03 = __riscv_vslide1down(p02, row0[std::min(width - 1, j + vl) * 4 ], vl);
p13 = __riscv_vslide1down(p12, row0[std::min(width - 1, j + vl) * 4 + 1], vl);
p23 = __riscv_vslide1down(p22, row0[std::min(width - 1, j + vl) * 4 + 2], vl);
p33 = __riscv_vslide1down(p32, row0[std::min(width - 1, j + vl) * 4 + 3], vl);
p08 = __riscv_vslide1down(p07, row1[std::min(width - 1, j + vl) * 4 ], vl);
p18 = __riscv_vslide1down(p17, row1[std::min(width - 1, j + vl) * 4 + 1], vl);
p28 = __riscv_vslide1down(p27, row1[std::min(width - 1, j + vl) * 4 + 2], vl);
p38 = __riscv_vslide1down(p37, row1[std::min(width - 1, j + vl) * 4 + 3], vl);
p013 = __riscv_vslide1down(p012, row2[std::min(width - 1, j + vl) * 4 ], vl);
p113 = __riscv_vslide1down(p112, row2[std::min(width - 1, j + vl) * 4 + 1], vl);
p213 = __riscv_vslide1down(p212, row2[std::min(width - 1, j + vl) * 4 + 2], vl);
p313 = __riscv_vslide1down(p312, row2[std::min(width - 1, j + vl) * 4 + 3], vl);
p018 = __riscv_vslide1down(p017, row3[std::min(width - 1, j + vl) * 4 ], vl);
p118 = __riscv_vslide1down(p117, row3[std::min(width - 1, j + vl) * 4 + 1], vl);
p218 = __riscv_vslide1down(p217, row3[std::min(width - 1, j + vl) * 4 + 2], vl);
p318 = __riscv_vslide1down(p317, row3[std::min(width - 1, j + vl) * 4 + 3], vl);
p023 = __riscv_vslide1down(p022, row4[std::min(width - 1, j + vl) * 4 ], vl);
p123 = __riscv_vslide1down(p122, row4[std::min(width - 1, j + vl) * 4 + 1], vl);
p223 = __riscv_vslide1down(p222, row4[std::min(width - 1, j + vl) * 4 + 2], vl);
p323 = __riscv_vslide1down(p322, row4[std::min(width - 1, j + vl) * 4 + 3], vl);
p04 = __riscv_vslide1down(p03, row0[std::min(width - 1, j + vl + 1) * 4 ], vl);
p14 = __riscv_vslide1down(p13, row0[std::min(width - 1, j + vl + 1) * 4 + 1], vl);
p24 = __riscv_vslide1down(p23, row0[std::min(width - 1, j + vl + 1) * 4 + 2], vl);
p34 = __riscv_vslide1down(p33, row0[std::min(width - 1, j + vl + 1) * 4 + 3], vl);
p09 = __riscv_vslide1down(p08, row1[std::min(width - 1, j + vl + 1) * 4 ], vl);
p19 = __riscv_vslide1down(p18, row1[std::min(width - 1, j + vl + 1) * 4 + 1], vl);
p29 = __riscv_vslide1down(p28, row1[std::min(width - 1, j + vl + 1) * 4 + 2], vl);
p39 = __riscv_vslide1down(p38, row1[std::min(width - 1, j + vl + 1) * 4 + 3], vl);
p014 = __riscv_vslide1down(p013, row2[std::min(width - 1, j + vl + 1) * 4 ], vl);
p114 = __riscv_vslide1down(p113, row2[std::min(width - 1, j + vl + 1) * 4 + 1], vl);
p214 = __riscv_vslide1down(p213, row2[std::min(width - 1, j + vl + 1) * 4 + 2], vl);
p314 = __riscv_vslide1down(p313, row2[std::min(width - 1, j + vl + 1) * 4 + 3], vl);
p019 = __riscv_vslide1down(p018, row3[std::min(width - 1, j + vl + 1) * 4 ], vl);
p119 = __riscv_vslide1down(p118, row3[std::min(width - 1, j + vl + 1) * 4 + 1], vl);
p219 = __riscv_vslide1down(p218, row3[std::min(width - 1, j + vl + 1) * 4 + 2], vl);
p319 = __riscv_vslide1down(p318, row3[std::min(width - 1, j + vl + 1) * 4 + 3], vl);
p024 = __riscv_vslide1down(p023, row4[std::min(width - 1, j + vl + 1) * 4 ], vl);
p124 = __riscv_vslide1down(p123, row4[std::min(width - 1, j + vl + 1) * 4 + 1], vl);
p224 = __riscv_vslide1down(p223, row4[std::min(width - 1, j + vl + 1) * 4 + 2], vl);
p324 = __riscv_vslide1down(p323, row4[std::min(width - 1, j + vl + 1) * 4 + 3], vl);
auto vop = [&vl](vuint8m2_t& a, vuint8m2_t& b) {
auto t = a;
a = __riscv_vminu(a, b, vl);
b = __riscv_vmaxu(t, b, vl);
};
vuint8m2x4_t dst{};
vop(p01, p02); vop(p00, p01); vop(p01, p02); vop(p04, p05); vop(p03, p04);
vop(p04, p05); vop(p00, p03); vop(p02, p05); vop(p02, p03); vop(p01, p04);
vop(p01, p02); vop(p03, p04); vop(p07, p08); vop(p06, p07); vop(p07, p08);
vop(p010, p011); vop(p09, p010); vop(p010, p011); vop(p06, p09); vop(p08, p011);
vop(p08, p09); vop(p07, p010); vop(p07, p08); vop(p09, p010); vop(p00, p06);
vop(p04, p010); vop(p04, p06); vop(p02, p08); vop(p02, p04); vop(p06, p08);
vop(p01, p07); vop(p05, p011); vop(p05, p07); vop(p03, p09); vop(p03, p05);
vop(p07, p09); vop(p01, p02); vop(p03, p04); vop(p05, p06); vop(p07, p08);
vop(p09, p010); vop(p013, p014); vop(p012, p013); vop(p013, p014); vop(p016, p017);
vop(p015, p016); vop(p016, p017); vop(p012, p015); vop(p014, p017); vop(p014, p015);
vop(p013, p016); vop(p013, p014); vop(p015, p016); vop(p019, p020); vop(p018, p019);
vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p021, p023); vop(p022, p024);
vop(p022, p023); vop(p018, p021); vop(p020, p023); vop(p020, p021); vop(p019, p022);
vop(p022, p024); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p012, p018);
vop(p016, p022); vop(p016, p018); vop(p014, p020); vop(p020, p024); vop(p014, p016);
vop(p018, p020); vop(p022, p024); vop(p013, p019); vop(p017, p023); vop(p017, p019);
vop(p015, p021); vop(p015, p017); vop(p019, p021); vop(p013, p014); vop(p015, p016);
vop(p017, p018); vop(p019, p020); vop(p021, p022); vop(p023, p024); vop(p00, p012);
vop(p08, p020); vop(p08, p012); vop(p04, p016); vop(p016, p024); vop(p012, p016);
vop(p02, p014); vop(p010, p022); vop(p010, p014); vop(p06, p018); vop(p06, p010);
vop(p010, p012); vop(p01, p013); vop(p09, p021); vop(p09, p013); vop(p05, p017);
vop(p013, p017); vop(p03, p015); vop(p011, p023); vop(p011, p015); vop(p07, p019);
vop(p07, p011); vop(p011, p013); vop(p011, p012);
dst = __riscv_vset_v_u8m2_u8m2x4(dst, 0, p012);
vop(p11, p12); vop(p10, p11); vop(p11, p12); vop(p14, p15); vop(p13, p14);
vop(p14, p15); vop(p10, p13); vop(p12, p15); vop(p12, p13); vop(p11, p14);
vop(p11, p12); vop(p13, p14); vop(p17, p18); vop(p16, p17); vop(p17, p18);
vop(p110, p111); vop(p19, p110); vop(p110, p111); vop(p16, p19); vop(p18, p111);
vop(p18, p19); vop(p17, p110); vop(p17, p18); vop(p19, p110); vop(p10, p16);
vop(p14, p110); vop(p14, p16); vop(p12, p18); vop(p12, p14); vop(p16, p18);
vop(p11, p17); vop(p15, p111); vop(p15, p17); vop(p13, p19); vop(p13, p15);
vop(p17, p19); vop(p11, p12); vop(p13, p14); vop(p15, p16); vop(p17, p18);
vop(p19, p110); vop(p113, p114); vop(p112, p113); vop(p113, p114); vop(p116, p117);
vop(p115, p116); vop(p116, p117); vop(p112, p115); vop(p114, p117); vop(p114, p115);
vop(p113, p116); vop(p113, p114); vop(p115, p116); vop(p119, p120); vop(p118, p119);
vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p121, p123); vop(p122, p124);
vop(p122, p123); vop(p118, p121); vop(p120, p123); vop(p120, p121); vop(p119, p122);
vop(p122, p124); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p112, p118);
vop(p116, p122); vop(p116, p118); vop(p114, p120); vop(p120, p124); vop(p114, p116);
vop(p118, p120); vop(p122, p124); vop(p113, p119); vop(p117, p123); vop(p117, p119);
vop(p115, p121); vop(p115, p117); vop(p119, p121); vop(p113, p114); vop(p115, p116);
vop(p117, p118); vop(p119, p120); vop(p121, p122); vop(p123, p124); vop(p10, p112);
vop(p18, p120); vop(p18, p112); vop(p14, p116); vop(p116, p124); vop(p112, p116);
vop(p12, p114); vop(p110, p122); vop(p110, p114); vop(p16, p118); vop(p16, p110);
vop(p110, p112); vop(p11, p113); vop(p19, p121); vop(p19, p113); vop(p15, p117);
vop(p113, p117); vop(p13, p115); vop(p111, p123); vop(p111, p115); vop(p17, p119);
vop(p17, p111); vop(p111, p113); vop(p111, p112);
dst = __riscv_vset_v_u8m2_u8m2x4(dst, 1, p112);
vop(p21, p22); vop(p20, p21); vop(p21, p22); vop(p24, p25); vop(p23, p24);
vop(p24, p25); vop(p20, p23); vop(p22, p25); vop(p22, p23); vop(p21, p24);
vop(p21, p22); vop(p23, p24); vop(p27, p28); vop(p26, p27); vop(p27, p28);
vop(p210, p211); vop(p29, p210); vop(p210, p211); vop(p26, p29); vop(p28, p211);
vop(p28, p29); vop(p27, p210); vop(p27, p28); vop(p29, p210); vop(p20, p26);
vop(p24, p210); vop(p24, p26); vop(p22, p28); vop(p22, p24); vop(p26, p28);
vop(p21, p27); vop(p25, p211); vop(p25, p27); vop(p23, p29); vop(p23, p25);
vop(p27, p29); vop(p21, p22); vop(p23, p24); vop(p25, p26); vop(p27, p28);
vop(p29, p210); vop(p213, p214); vop(p212, p213); vop(p213, p214); vop(p216, p217);
vop(p215, p216); vop(p216, p217); vop(p212, p215); vop(p214, p217); vop(p214, p215);
vop(p213, p216); vop(p213, p214); vop(p215, p216); vop(p219, p220); vop(p218, p219);
vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p221, p223); vop(p222, p224);
vop(p222, p223); vop(p218, p221); vop(p220, p223); vop(p220, p221); vop(p219, p222);
vop(p222, p224); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p212, p218);
vop(p216, p222); vop(p216, p218); vop(p214, p220); vop(p220, p224); vop(p214, p216);
vop(p218, p220); vop(p222, p224); vop(p213, p219); vop(p217, p223); vop(p217, p219);
vop(p215, p221); vop(p215, p217); vop(p219, p221); vop(p213, p214); vop(p215, p216);
vop(p217, p218); vop(p219, p220); vop(p221, p222); vop(p223, p224); vop(p20, p212);
vop(p28, p220); vop(p28, p212); vop(p24, p216); vop(p216, p224); vop(p212, p216);
vop(p22, p214); vop(p210, p222); vop(p210, p214); vop(p26, p218); vop(p26, p210);
vop(p210, p212); vop(p21, p213); vop(p29, p221); vop(p29, p213); vop(p25, p217);
vop(p213, p217); vop(p23, p215); vop(p211, p223); vop(p211, p215); vop(p27, p219);
vop(p27, p211); vop(p211, p213); vop(p211, p212);
dst = __riscv_vset_v_u8m2_u8m2x4(dst, 2, p212);
vop(p31, p32); vop(p30, p31); vop(p31, p32); vop(p34, p35); vop(p33, p34);
vop(p34, p35); vop(p30, p33); vop(p32, p35); vop(p32, p33); vop(p31, p34);
vop(p31, p32); vop(p33, p34); vop(p37, p38); vop(p36, p37); vop(p37, p38);
vop(p310, p311); vop(p39, p310); vop(p310, p311); vop(p36, p39); vop(p38, p311);
vop(p38, p39); vop(p37, p310); vop(p37, p38); vop(p39, p310); vop(p30, p36);
vop(p34, p310); vop(p34, p36); vop(p32, p38); vop(p32, p34); vop(p36, p38);
vop(p31, p37); vop(p35, p311); vop(p35, p37); vop(p33, p39); vop(p33, p35);
vop(p37, p39); vop(p31, p32); vop(p33, p34); vop(p35, p36); vop(p37, p38);
vop(p39, p310); vop(p313, p314); vop(p312, p313); vop(p313, p314); vop(p316, p317);
vop(p315, p316); vop(p316, p317); vop(p312, p315); vop(p314, p317); vop(p314, p315);
vop(p313, p316); vop(p313, p314); vop(p315, p316); vop(p319, p320); vop(p318, p319);
vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p321, p323); vop(p322, p324);
vop(p322, p323); vop(p318, p321); vop(p320, p323); vop(p320, p321); vop(p319, p322);
vop(p322, p324); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p312, p318);
vop(p316, p322); vop(p316, p318); vop(p314, p320); vop(p320, p324); vop(p314, p316);
vop(p318, p320); vop(p322, p324); vop(p313, p319); vop(p317, p323); vop(p317, p319);
vop(p315, p321); vop(p315, p317); vop(p319, p321); vop(p313, p314); vop(p315, p316);
vop(p317, p318); vop(p319, p320); vop(p321, p322); vop(p323, p324); vop(p30, p312);
vop(p38, p320); vop(p38, p312); vop(p34, p316); vop(p316, p324); vop(p312, p316);
vop(p32, p314); vop(p310, p322); vop(p310, p314); vop(p36, p318); vop(p36, p310);
vop(p310, p312); vop(p31, p313); vop(p39, p321); vop(p39, p313); vop(p35, p317);
vop(p313, p317); vop(p33, p315); vop(p311, p323); vop(p311, p315); vop(p37, p319);
vop(p37, p311); vop(p311, p313); vop(p311, p312);
dst = __riscv_vset_v_u8m2_u8m2x4(dst, 3, p312);
__riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
}
}
}
return CV_HAL_ERROR_OK;
}
} // anonymous
int medianBlur(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, int ksize)
{
const int type = CV_MAKETYPE(depth, cn);
if (type != CV_8UC1 && type != CV_8UC4 && type != CV_16UC1 && type != CV_16SC1 && type != CV_32FC1)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
if ((ksize != 3 && ksize != 5) || src_data == dst_data)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
switch (ksize*100 + type)
{
case 300 + CV_8UC1:
return common::invoke(height, {medianBlurC1<3, RVV_U8M4>}, src_data, src_step, dst_data, dst_step, width, height);
case 300 + CV_16UC1:
return common::invoke(height, {medianBlurC1<3, RVV_U16M4>}, src_data, src_step, dst_data, dst_step, width, height);
case 300 + CV_16SC1:
return common::invoke(height, {medianBlurC1<3, RVV_I16M4>}, src_data, src_step, dst_data, dst_step, width, height);
case 300 + CV_32FC1:
return common::invoke(height, {medianBlurC1<3, RVV_F32M4>}, src_data, src_step, dst_data, dst_step, width, height);
case 500 + CV_8UC1:
return common::invoke(height, {medianBlurC1<5, RVV_U8M1>}, src_data, src_step, dst_data, dst_step, width, height);
case 500 + CV_16UC1:
return common::invoke(height, {medianBlurC1<5, RVV_U16M1>}, src_data, src_step, dst_data, dst_step, width, height);
case 500 + CV_16SC1:
return common::invoke(height, {medianBlurC1<5, RVV_I16M1>}, src_data, src_step, dst_data, dst_step, width, height);
case 500 + CV_32FC1:
return common::invoke(height, {medianBlurC1<5, RVV_F32M1>}, src_data, src_step, dst_data, dst_step, width, height);
case 300 + CV_8UC4:
return common::invoke(height, {medianBlurC4<3>}, src_data, src_step, dst_data, dst_step, width, height);
case 500 + CV_8UC4:
return common::invoke(height, {medianBlurC4<5>}, src_data, src_step, dst_data, dst_step, width, height);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -4,16 +4,13 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
#define OPENCV_HAL_RVV_MOMENTS_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace imgproc {
namespace cv { namespace cv_hal_rvv {
#if CV_HAL_RVV_1P0_ENABLED
namespace imageMoments {
#undef cv_hal_imageMoments
#define cv_hal_imageMoments cv::cv_hal_rvv::imageMoments::imageMoments
namespace {
class MomentsInvoker : public ParallelLoopBody
{
@ -152,9 +149,11 @@ static inline int imageMoments(int start, int end, const uchar* src_data, size_t
return CV_HAL_ERROR_OK;
}
} // anonymous
// the algorithm is copied from imgproc/src/moments.cpp,
// in the function cv::Moments cv::moments
inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10])
int imageMoments(const uchar* src_data, size_t src_step, int src_type, int width, int height, bool binary, double m[10])
{
if (src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1 && src_type != CV_64FC1)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -184,8 +183,7 @@ inline int imageMoments(const uchar* src_data, size_t src_step, int src_type, in
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::imageMoments
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,331 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Filter configuration captured by morphInit and consumed by the morph
// worker. Field order matters: the struct is aggregate-initialized in
// morphInit in exactly this order.
struct Morph2D
{
    int operation;       // CV_HAL_MORPH_ERODE or CV_HAL_MORPH_DILATE (validated in morphInit)
    int src_type;        // CV_8UC1 or CV_8UC4; morphInit requires src_type == dst_type
    int dst_type;
    int kernel_type;     // always CV_8UC1 (validated in morphInit)
    uchar* kernel_data;  // 3x3 structuring element; non-zero entry = active position
    size_t kernel_step;  // row stride of kernel_data in bytes
    int kernel_width;    // always 3 (validated in morphInit)
    int kernel_height;   // always 3
    int anchor_x;        // kernel anchor; made non-negative by morphInit
    int anchor_y;
    int borderType;      // border mode, possibly with BORDER_ISOLATED set
    const uchar* borderValue; // constant border, 1 byte (C1) or 4 bytes (C4); owned, freed in morphFree
};
template<int op> struct rvv;
// Erosion policy: the morphological reduction is a minimum, so the identity
// element is the largest representable pixel value.
template<> struct rvv<CV_HAL_MORPH_ERODE>
{
    // Identity value for the min-reduction.
    static inline uchar init() { return std::numeric_limits<uchar>::max(); }
    // Scalar element-wise minimum of two pixels.
    static inline uchar mop(uchar x, uchar y) { return x < y ? x : y; }
    // Vector-vector minimum over the first n lanes.
    static inline vuint8m4_t vop(vuint8m4_t x, vuint8m4_t y, size_t n) { return __riscv_vminu(x, y, n); }
    // Vector-scalar minimum (used to fold in the constant border value).
    static inline vuint8m4_t vop(vuint8m4_t x, uchar y, size_t n) { return __riscv_vminu(x, y, n); }
};
// Dilation policy: the morphological reduction is a maximum, so the identity
// element is the smallest representable pixel value.
template<> struct rvv<CV_HAL_MORPH_DILATE>
{
    // Identity value for the max-reduction.
    static inline uchar init() { return std::numeric_limits<uchar>::min(); }
    // Scalar element-wise maximum of two pixels.
    static inline uchar mop(uchar x, uchar y) { return x > y ? x : y; }
    // Vector-vector maximum over the first n lanes.
    static inline vuint8m4_t vop(vuint8m4_t x, vuint8m4_t y, size_t n) { return __riscv_vmaxu(x, y, n); }
    // Vector-scalar maximum (used to fold in the constant border value).
    static inline vuint8m4_t vop(vuint8m4_t x, uchar y, size_t n) { return __riscv_vmaxu(x, y, n); }
};
// the algorithm is copied from 3rdparty/carotene/src/morph.cpp,
// in the function template void morph3x3
//
// Strip worker: applies a 3x3 erode/dilate (chosen via rvv<op>) to rows
// [start, end) of an 8UC1 or 8UC4 image. dst_data is a tightly packed
// buffer with a row stride of width * channels (see the caller morph()
// below, which passes a temporary vector), while src_data keeps its
// original src_step byte stride.
template<int op>
static inline int morph(int start, int end, Morph2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
{
    // Flatten the 3x3 structuring element: kernel[i] is true when the
    // element at row i/3, column i%3 participates in the min/max.
    bool kernel[9];
    for (int i = 0; i < 9; i++)
    {
        kernel[i] = data->kernel_data[(i / 3) * data->kernel_step + i % 3] != 0;
    }

    // Sentinel coordinate meaning "outside the image": the constant border
    // value must be used instead of a source pixel.
    constexpr int noval = std::numeric_limits<int>::max();

    // Translate a destination position (x, y) plus the kernel anchor offset
    // into source coordinates under the configured border mode. With
    // BORDER_ISOLATED the ROI is interpolated as a standalone image;
    // otherwise the ROI's offset inside the full image is honoured.
    auto access = [&](int x, int y) {
        int pi, pj;
        if (data->borderType & BORDER_ISOLATED)
        {
            pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED);
            pj = common::borderInterpolate(y - data->anchor_x, width , data->borderType & ~BORDER_ISOLATED);
            pi = pi < 0 ? noval : pi;
            pj = pj < 0 ? noval : pj;
        }
        else
        {
            pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType);
            pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width , data->borderType);
            pi = pi < 0 ? noval : pi - offset_y;
            pj = pj < 0 ? noval : pj - offset_x;
        }
        return std::make_pair(pi, pj);
    };

    // Scalar fallback for a single output pixel; used near the left/right
    // edges and whenever the row is too narrow to vectorize.
    auto process = [&](int x, int y) {
        if (data->src_type == CV_8UC1)
        {
            uchar val = rvv<op>::init();
            for (int i = 0; i < 9; i++)
            {
                if (kernel[i])
                {
                    auto p = access(x + i / 3, y + i % 3);
                    if (p.first != noval && p.second != noval)
                    {
                        val = rvv<op>::mop(val, src_data[p.first * src_step + p.second]);
                    }
                    else
                    {
                        val = rvv<op>::mop(val, data->borderValue[0]);
                    }
                }
            }
            dst_data[x * width + y] = val;
        }
        else
        {
            // Four interleaved channels, each reduced independently.
            uchar val0, val1, val2, val3;
            val0 = val1 = val2 = val3 = rvv<op>::init();
            for (int i = 0; i < 9; i++)
            {
                if (kernel[i])
                {
                    auto p = access(x + i / 3, y + i % 3);
                    if (p.first != noval && p.second != noval)
                    {
                        val0 = rvv<op>::mop(val0, src_data[p.first * src_step + p.second * 4    ]);
                        val1 = rvv<op>::mop(val1, src_data[p.first * src_step + p.second * 4 + 1]);
                        val2 = rvv<op>::mop(val2, src_data[p.first * src_step + p.second * 4 + 2]);
                        val3 = rvv<op>::mop(val3, src_data[p.first * src_step + p.second * 4 + 3]);
                    }
                    else
                    {
                        val0 = rvv<op>::mop(val0, data->borderValue[0]);
                        val1 = rvv<op>::mop(val1, data->borderValue[1]);
                        val2 = rvv<op>::mop(val2, data->borderValue[2]);
                        val3 = rvv<op>::mop(val3, data->borderValue[3]);
                    }
                }
            }
            dst_data[(x * width + y) * 4    ] = val0;
            dst_data[(x * width + y) * 4 + 1] = val1;
            dst_data[(x * width + y) * 4 + 2] = val2;
            dst_data[(x * width + y) * 4 + 3] = val3;
        }
    };

    // Columns in [left, right) can read all three horizontal taps without
    // any column border handling; everything outside goes through process().
    const int left = data->anchor_x, right = width - (2 - data->anchor_x);
    for (int i = start; i < end; i++)
    {
        if (left >= right)
        {
            // Row too narrow for the vector path: do it all scalar.
            for (int j = 0; j < width; j++)
                process(i, j);
        }
        else
        {
            for (int j = 0; j < left; j++)
                process(i, j);
            for (int j = right; j < width; j++)
                process(i, j);

            // Resolve the three source rows once per output row; nullptr
            // marks a row outside the image (use the constant border value).
            const uchar* row0 = access(i    , 0).first == noval ? nullptr : src_data + access(i    , 0).first * src_step;
            const uchar* row1 = access(i + 1, 0).first == noval ? nullptr : src_data + access(i + 1, 0).first * src_step;
            const uchar* row2 = access(i + 2, 0).first == noval ? nullptr : src_data + access(i + 2, 0).first * src_step;
            if (data->src_type == CV_8UC1)
            {
                int vl;
                for (int j = left; j < right; j += vl)
                {
                    vl = __riscv_vsetvl_e8m4(right - j);
                    // Accumulator seeded with the operation's identity value.
                    auto m0 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
                    // Fold one source row into the accumulator. The three
                    // horizontal kernel taps are realized by sliding the
                    // loaded vector down one lane at a time, pulling in
                    // extra[vl] and extra[vl + 1] from the row tail.
                    auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) {
                        if (!row)
                        {
                            m0 = rvv<op>::vop(m0, data->borderValue[0], vl);
                            return;
                        }
                        const uchar* extra = row + j - data->anchor_x;
                        auto v0 = __riscv_vle8_v_u8m4(extra, vl);
                        if (k0) m0 = rvv<op>::vop(m0, v0, vl);
                        v0 = __riscv_vslide1down(v0, extra[vl], vl);
                        if (k1) m0 = rvv<op>::vop(m0, v0, vl);
                        if (!k2) return;
                        v0 = __riscv_vslide1down(v0, extra[vl + 1], vl);
                        m0 = rvv<op>::vop(m0, v0, vl);
                    };
                    loadsrc(row0, kernel[0], kernel[1], kernel[2]);
                    loadsrc(row1, kernel[3], kernel[4], kernel[5]);
                    loadsrc(row2, kernel[6], kernel[7], kernel[8]);
                    __riscv_vse8(dst_data + i * width + j, m0, vl);
                }
            }
            else
            {
                int vl, vl0, vl1;
                for (int j = left; j < right; j += vl)
                {
                    vl = __riscv_vsetvl_e8m4(right - j);
                    // The 4-field segmented load below operates on u8m2x4
                    // groups, so an m4-sized strip is processed as two m2
                    // halves of vl0 and vl1 elements; vlenb() * 2 is the
                    // e8 lane capacity of an m2 register group.
                    vl0 = std::min(vl, (int)__riscv_vlenb() * 2);
                    vl1 = vl - vl0;
                    // One accumulator per channel, seeded with the identity.
                    auto m0 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
                    auto m1 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
                    auto m2 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
                    auto m3 = __riscv_vmv_v_x_u8m4(rvv<op>::init(), vl);
                    // Apply the three horizontal taps of one row to a single
                    // channel accumulator; r1/r2 are the two scalars shifted
                    // in from beyond the loaded strip.
                    auto opshift = [&](vuint8m4_t a, vuint8m4_t b, bool k0, bool k1, bool k2, uchar r1, uchar r2) {
                        if (k0) a = rvv<op>::vop(a, b, vl);
                        b = __riscv_vslide1down(b, r1, vl);
                        if (k1) a = rvv<op>::vop(a, b, vl);
                        if (!k2) return a;
                        b = __riscv_vslide1down(b, r2, vl);
                        return rvv<op>::vop(a, b, vl);
                    };
                    // De-interleave one RGBA row into per-channel vectors
                    // (two segmented loads, one per m2 half) and fold it
                    // into the four accumulators.
                    auto loadsrc = [&](const uchar* row, bool k0, bool k1, bool k2) {
                        if (!row)
                        {
                            m0 = rvv<op>::vop(m0, data->borderValue[0], vl);
                            m1 = rvv<op>::vop(m1, data->borderValue[1], vl);
                            m2 = rvv<op>::vop(m2, data->borderValue[2], vl);
                            m3 = rvv<op>::vop(m3, data->borderValue[3], vl);
                            return;
                        }
                        vuint8m4_t v0{}, v1{}, v2{}, v3{};
                        const uchar* extra = row + (j - data->anchor_x) * 4;
                        auto src = __riscv_vlseg4e8_v_u8m2x4(extra, vl0);
                        v0 = __riscv_vset_v_u8m2_u8m4(v0, 0, __riscv_vget_v_u8m2x4_u8m2(src, 0));
                        v1 = __riscv_vset_v_u8m2_u8m4(v1, 0, __riscv_vget_v_u8m2x4_u8m2(src, 1));
                        v2 = __riscv_vset_v_u8m2_u8m4(v2, 0, __riscv_vget_v_u8m2x4_u8m2(src, 2));
                        v3 = __riscv_vset_v_u8m2_u8m4(v3, 0, __riscv_vget_v_u8m2x4_u8m2(src, 3));
                        src = __riscv_vlseg4e8_v_u8m2x4(extra + vl0 * 4, vl1);
                        v0 = __riscv_vset_v_u8m2_u8m4(v0, 1, __riscv_vget_v_u8m2x4_u8m2(src, 0));
                        v1 = __riscv_vset_v_u8m2_u8m4(v1, 1, __riscv_vget_v_u8m2x4_u8m2(src, 1));
                        v2 = __riscv_vset_v_u8m2_u8m4(v2, 1, __riscv_vget_v_u8m2x4_u8m2(src, 2));
                        v3 = __riscv_vset_v_u8m2_u8m4(v3, 1, __riscv_vget_v_u8m2x4_u8m2(src, 3));
                        extra += vl * 4;
                        m0 = opshift(m0, v0, k0, k1, k2, extra[0], extra[4]);
                        m1 = opshift(m1, v1, k0, k1, k2, extra[1], extra[5]);
                        m2 = opshift(m2, v2, k0, k1, k2, extra[2], extra[6]);
                        m3 = opshift(m3, v3, k0, k1, k2, extra[3], extra[7]);
                    };
                    loadsrc(row0, kernel[0], kernel[1], kernel[2]);
                    loadsrc(row1, kernel[3], kernel[4], kernel[5]);
                    loadsrc(row2, kernel[6], kernel[7], kernel[8]);
                    // Re-interleave the channel results and store each half.
                    vuint8m2x4_t val{};
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 0));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 0));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 0));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 0));
                    __riscv_vsseg4e8(dst_data + (i * width + j) * 4, val, vl0);
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 0, __riscv_vget_v_u8m4_u8m2(m0, 1));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 1, __riscv_vget_v_u8m4_u8m2(m1, 1));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 2, __riscv_vget_v_u8m4_u8m2(m2, 1));
                    val = __riscv_vset_v_u8m2_u8m2x4(val, 3, __riscv_vget_v_u8m4_u8m2(m3, 1));
                    __riscv_vsseg4e8(dst_data + (i * width + j + vl0) * 4, val, vl1);
                }
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// Validate the requested morphology configuration and build a Morph2D
// context for morph(). Supported: 8UC1/8UC4 images, a single-iteration
// 3x3 erode or dilate, any border mode except wrap.
int morphInit(cvhalFilter2D** context, int operation, int src_type, int dst_type, int /*max_width*/, int /*max_height*/, int kernel_type, uchar* kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, int borderType, const double borderValue[4], int iterations, bool /*allowSubmatrix*/, bool /*allowInplace*/)
{
    if (kernel_type != CV_8UC1 || src_type != dst_type)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (src_type != CV_8UC1 && src_type != CV_8UC4)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (kernel_width != kernel_height || kernel_width != 3)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (iterations != 1)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (operation != CV_HAL_MORPH_ERODE && operation != CV_HAL_MORPH_DILATE)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    if ((borderType & ~BORDER_ISOLATED) == BORDER_WRAP)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // Clamp the double border values to uchar. DBL_MAX is treated specially
    // for dilate and replaced with 0 — presumably the caller's marker for
    // "neutral border"; confirm against the OpenCV morphology HAL contract.
    // NOTE: the C1 case uses scalar new and the C4 case uses new[]; morphFree
    // must free with the matching form.
    uchar* borderV;
    if (src_type == CV_8UC1)
    {
        borderV = new uchar{static_cast<uchar>(borderValue[0])};
        if (operation == CV_HAL_MORPH_DILATE && borderValue[0] == DBL_MAX)
            borderV[0] = 0;
    }
    else
    {
        borderV = new uchar[4]{static_cast<uchar>(borderValue[0]), static_cast<uchar>(borderValue[1]), static_cast<uchar>(borderValue[2]), static_cast<uchar>(borderValue[3])};
        if (operation == CV_HAL_MORPH_DILATE)
        {
            for (int k = 0; k < 4; k++)
            {
                if (borderValue[k] == DBL_MAX)
                    borderV[k] = 0;
            }
        }
    }

    // A negative anchor means "use the kernel center".
    if (anchor_x < 0)
        anchor_x = kernel_width / 2;
    if (anchor_y < 0)
        anchor_y = kernel_height / 2;

    *context = reinterpret_cast<cvhalFilter2D*>(new Morph2D{operation, src_type, dst_type, kernel_type, kernel_data, kernel_step, kernel_width, kernel_height, anchor_x, anchor_y, borderType, borderV});
    return CV_HAL_ERROR_OK;
}
// Morphology HAL entry point: runs the erode/dilate worker over the ROI.
// The worker writes into a tightly packed temporary (stride = width * cn)
// so it never has to reason about dst_step; the rows are copied out here.
// Returns the worker's status, or CV_HAL_ERROR_NOT_IMPLEMENTED for an
// unknown operation (morphInit only accepts erode/dilate, so this is a
// defensive default).
int morph(cvhalFilter2D* context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int src_full_width, int src_full_height, int src_roi_x, int src_roi_y, int /*dst_full_width*/, int /*dst_full_height*/, int /*dst_roi_x*/, int /*dst_roi_y*/)
{
    Morph2D* data = reinterpret_cast<Morph2D*>(context);
    int cn = data->src_type == CV_8UC1 ? 1 : 4;
    std::vector<uchar> dst(width * height * cn);
    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
    switch (data->operation)
    {
    case CV_HAL_MORPH_ERODE:
        res = common::invoke(height, {morph<CV_HAL_MORPH_ERODE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
        break;
    case CV_HAL_MORPH_DILATE:
        res = common::invoke(height, {morph<CV_HAL_MORPH_DILATE>}, data, src_data, src_step, dst.data(), width, height, src_full_width, src_full_height, src_roi_x, src_roi_y);
        break;
    }
    // Only publish the result on success; the original unconditionally
    // memcpy'd, which on failure overwrote the caller's destination with
    // the zero-initialized scratch buffer.
    if (res == CV_HAL_ERROR_OK)
    {
        for (int i = 0; i < height; i++)
            memcpy(dst_data + i * dst_step, dst.data() + i * width * cn, width * cn);
    }
    return res;
}
// Releases the state allocated by morphInit().
//
// Fix: morphInit allocates borderValue with scalar `new uchar{...}` for
// CV_8UC1 input but with array `new uchar[4]{...}` otherwise. A scalar
// allocation must be released with `delete` and an array allocation with
// `delete[]`; mixing the forms (as the original plain `delete` did for the
// 4-channel case) is undefined behavior. Dispatch on src_type, which is the
// same discriminator morphInit used.
int morphFree(cvhalFilter2D* context)
{
    Morph2D* data = reinterpret_cast<Morph2D*>(context);
    if (data->src_type == CV_8UC1)
        delete data->borderValue;    // matches `new uchar{...}`
    else
        delete[] data->borderValue;  // matches `new uchar[4]{...}`
    delete data;
    return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -4,18 +4,13 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED
#define OPENCV_HAL_RVV_PYRAMIDS_HPP_INCLUDED
#include "rvv_hal.hpp"
#include <riscv_vector.h>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
namespace cv { namespace cv_hal_rvv { namespace pyramids {
#if CV_HAL_RVV_1P0_ENABLED
#undef cv_hal_pyrdown
#define cv_hal_pyrdown cv::cv_hal_rvv::pyramids::pyrDown
#undef cv_hal_pyrup
#define cv_hal_pyrup cv::cv_hal_rvv::pyramids::pyrUp
namespace {
template<typename T> struct rvv;
@ -562,7 +557,9 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_
return CV_HAL_ERROR_OK;
}
inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
} // anonymous
int pyrDown(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
{
if (border_type == BORDER_CONSTANT || (depth == CV_32F && cn == 1))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -580,7 +577,7 @@ inline int pyrDown(const uchar* src_data, size_t src_step, int src_width, int sr
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_height, uchar* dst_data, size_t dst_step, int dst_width, int dst_height, int depth, int cn, int border_type)
{
if (border_type != BORDER_DEFAULT)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -598,6 +595,6 @@ inline int pyrUp(const uchar* src_data, size_t src_step, int src_width, int src_
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
}}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -4,17 +4,15 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED
#define OPENCV_HAL_RVV_RESIZE_HPP_INCLUDED
#include <riscv_vector.h>
#include "rvv_hal.hpp"
#include "common.hpp"
#include <list>
namespace cv { namespace cv_hal_rvv {
namespace cv { namespace rvv_hal { namespace imgproc {
namespace resize {
#undef cv_hal_resize
#define cv_hal_resize cv::cv_hal_rvv::resize::resize
#if CV_HAL_RVV_1P0_ENABLED
namespace {
class ResizeInvoker : public ParallelLoopBody
{
@ -986,7 +984,9 @@ static inline int resizeArea(int src_type, const uchar *src_data, size_t src_ste
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation)
} // anonymous
int resize(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, double inv_scale_x, double inv_scale_y, int interpolation)
{
inv_scale_x = 1 / inv_scale_x;
inv_scale_y = 1 / inv_scale_y;
@ -999,8 +999,7 @@ inline int resize(int src_type, const uchar *src_data, size_t src_step, int src_
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::resize
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -0,0 +1,259 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#include "rvv_hal.hpp"
#include "common.hpp"
namespace cv { namespace rvv_hal { namespace imgproc {
#if CV_HAL_RVV_1P0_ENABLED
namespace {
// Filter state created by sepFilterInit() and handed back to sepFilter()
// through the opaque cvhalFilter2D* context pointer.
struct sepFilter2D
{
    int src_type;              // CV_8UC1, CV_16SC1 or CV_32FC1 (checked in sepFilterInit)
    int dst_type;              // CV_16SC1 or CV_32FC1 (checked in sepFilterInit)
    int kernel_type;           // always CV_32FC1 (checked in sepFilterInit)
    const uchar* kernelx_data; // horizontal (row) kernel; read as float* by the worker
    int kernelx_length;        // 3 or 5; equal to kernely_length
    const uchar* kernely_data; // vertical (column) kernel; read as float* by the worker
    int kernely_length;        // same length as kernelx_length
    int anchor_x;              // kernel anchor; negative input was replaced by length/2
    int anchor_y;
    double delta;              // added to each filtered value before the final store
    int borderType;            // stored with the BORDER_ISOLATED bit cleared by sepFilterInit
};
// the algorithm is copied from 3rdparty/carotene/src/separable_filter.hpp,
// in the functor RowFilter3x3S16Generic and ColFilter3x3S16Generic
//
// Separable filtering of the row strip [start, end) of one image.
// Two passes share a ring buffer `res` of ksize float rows:
//   1) horizontal pass: convolve each needed source row with kx and store the
//      float row into the ring buffer (vectorized inner stretch, scalar edges);
//   2) vertical pass: once ksize rows are available, combine them with ky,
//      add data->delta and store row `cur` to dst (narrowed to short when
//      dst_type is CV_16SC1, otherwise stored as float).
// Border handling follows data->borderType; positions mapped outside the
// image yield the `noval` sentinel and contribute nothing to the sum.
template<int ksize, typename T>
static inline int sepFilter(int start, int end, sepFilter2D* data, const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
{
    // Sentinel meaning "outside the image" (borderInterpolate returned < 0).
    constexpr int noval = std::numeric_limits<int>::max();
    // Map a vertical position (row) through the border mode, relative to the
    // strip when BORDER_ISOLATED is set, else relative to the full image.
    // NOTE(review): sepFilterInit stores borderType with BORDER_ISOLATED
    // cleared, so the isolated branch looks unreachable from that path — confirm.
    auto accessX = [&](int x) {
        int pi;
        if (data->borderType & BORDER_ISOLATED)
        {
            pi = common::borderInterpolate(x - data->anchor_y, height, data->borderType & ~BORDER_ISOLATED);
            pi = pi < 0 ? noval : pi;
        }
        else
        {
            pi = common::borderInterpolate(offset_y + x - data->anchor_y, full_height, data->borderType);
            pi = pi < 0 ? noval : pi - offset_y;
        }
        return pi;
    };
    // Map a horizontal position (column) the same way.
    auto accessY = [&](int y) {
        int pj;
        if (data->borderType & BORDER_ISOLATED)
        {
            pj = common::borderInterpolate(y - data->anchor_x, width, data->borderType & ~BORDER_ISOLATED);
            pj = pj < 0 ? noval : pj;
        }
        else
        {
            pj = common::borderInterpolate(offset_x + y - data->anchor_x, full_width, data->borderType);
            pj = pj < 0 ? noval : pj - offset_x;
        }
        return pj;
    };
    // Ring-buffer addressing: row x modulo ksize, column y.
    auto p2idx = [&](int x, int y){ return (x + ksize) % ksize * width + y; };

    const float* kx = reinterpret_cast<const float*>(data->kernelx_data);
    const float* ky = reinterpret_cast<const float*>(data->kernely_data);
    // ksize rows of horizontal-pass output (float, width columns each).
    std::vector<float> res(width * ksize);
    // Scalar horizontal convolution for a single (row x, column y) position;
    // used near the left/right edges where border mapping is needed per tap.
    auto process = [&](int x, int y) {
        float sum = 0;
        for (int i = 0; i < ksize; i++)
        {
            int p = accessY(y + i);
            if (p != noval)
            {
                sum += kx[i] * reinterpret_cast<const T*>(src_data + x * src_step)[p];
            }
        }
        res[p2idx(x, y)] = sum;
    };

    // Columns [left, right) can be filtered without border checks.
    const int left = data->anchor_x, right = width - (ksize - 1 - data->anchor_x);
    // Iterate enough extra rows before/after the strip to fill the ring buffer.
    for (int i = start - data->anchor_y; i < end + (ksize - 1 - data->anchor_y); i++)
    {
        if (i + offset_y >= 0 && i + offset_y < full_height)
        {
            if (left >= right)
            {
                // Kernel wider than the interior: everything is edge handling.
                for (int j = 0; j < width; j++)
                    process(i, j);
            }
            else
            {
                // Scalar edges, vectorized interior.
                for (int j = 0; j < left; j++)
                    process(i, j);
                for (int j = right; j < width; j++)
                    process(i, j);
                int vl;
                for (int j = left; j < right; j += vl)
                {
                    vl = __riscv_vsetvl_e8m2(right - j);
                    const T* extra = reinterpret_cast<const T*>(src_data + i * src_step) + j - data->anchor_x;
                    // Load source elements and widen to f32 regardless of T.
                    vfloat32m8_t src;
                    if (std::is_same<T, uchar>::value)
                    {
                        src = __riscv_vfwcvt_f(__riscv_vwcvtu_x(__riscv_vle8_v_u8m2(reinterpret_cast<const uchar*>(extra), vl), vl), vl);
                    }
                    else if (std::is_same<T, short>::value)
                    {
                        src = __riscv_vfwcvt_f(__riscv_vle16_v_i16m4(reinterpret_cast<const short*>(extra), vl), vl);
                    }
                    else
                    {
                        src = __riscv_vle32_v_f32m8(reinterpret_cast<const float*>(extra), vl);
                    }
                    extra += vl;
                    // Horizontal convolution by slide-down: each slide shifts the
                    // window one element, feeding the next scalar in at the top.
                    auto sum = __riscv_vfmul(src, kx[0], vl);
                    src = __riscv_vfslide1down(src, extra[0], vl);
                    sum = __riscv_vfmacc(sum, kx[1], src, vl);
                    src = __riscv_vfslide1down(src, extra[1], vl);
                    sum = __riscv_vfmacc(sum, kx[2], src, vl);
                    if (ksize == 5)
                    {
                        src = __riscv_vfslide1down(src, extra[2], vl);
                        sum = __riscv_vfmacc(sum, kx[3], src, vl);
                        src = __riscv_vfslide1down(src, extra[3], vl);
                        sum = __riscv_vfmacc(sum, kx[4], src, vl);
                    }
                    __riscv_vse32(res.data() + p2idx(i, j), sum, vl);
                }
            }
        }

        // Vertical pass: row `cur` has all ksize contributing rows buffered.
        int cur = i - (ksize - 1 - data->anchor_y);
        if (cur >= start)
        {
            // nullptr marks a contributing row that lies outside the image.
            const float* row0 = accessX(cur    ) == noval ? nullptr : res.data() + p2idx(accessX(cur    ), 0);
            const float* row1 = accessX(cur + 1) == noval ? nullptr : res.data() + p2idx(accessX(cur + 1), 0);
            const float* row2 = accessX(cur + 2) == noval ? nullptr : res.data() + p2idx(accessX(cur + 2), 0);
            const float* row3 = nullptr, *row4 = nullptr;
            if (ksize == 5)
            {
                row3 = accessX(cur + 3) == noval ? nullptr : res.data() + p2idx(accessX(cur + 3), 0);
                row4 = accessX(cur + 4) == noval ? nullptr : res.data() + p2idx(accessX(cur + 4), 0);
            }
            int vl;
            for (int j = 0; j < width; j += vl)
            {
                vl = __riscv_vsetvl_e32m4(width - j);
                // Missing rows contribute zero.
                auto v0 = row0 ? __riscv_vle32_v_f32m4(row0 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
                auto v1 = row1 ? __riscv_vle32_v_f32m4(row1 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
                auto v2 = row2 ? __riscv_vle32_v_f32m4(row2 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
                // Accumulator starts at delta so it is added exactly once.
                auto sum = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmv_v_f_f32m4(data->delta, vl), ky[0], v0, vl), ky[1], v1, vl), ky[2], v2, vl);
                if (ksize == 5)
                {
                    auto v3 = row3 ? __riscv_vle32_v_f32m4(row3 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
                    auto v4 = row4 ? __riscv_vle32_v_f32m4(row4 + j, vl) : __riscv_vfmv_v_f_f32m4(0, vl);
                    sum = __riscv_vfmacc(__riscv_vfmacc(sum, ky[3], v3, vl), ky[4], v4, vl);
                }
                if (data->dst_type == CV_16SC1)
                {
                    // Narrowing float->int16 convert on store.
                    __riscv_vse16(reinterpret_cast<short*>(dst_data + cur * dst_step) + j, __riscv_vfncvt_x(sum, vl), vl);
                }
                else
                {
                    __riscv_vse32(reinterpret_cast<float*>(dst_data + cur * dst_step) + j, sum, vl);
                }
            }
        }
    }
    return CV_HAL_ERROR_OK;
}
} // anonymous
// Validates the requested separable-filter configuration and, when supported,
// allocates the sepFilter2D state returned through `context`.
// Returns CV_HAL_ERROR_NOT_IMPLEMENTED for any unsupported combination so
// OpenCV falls back to its generic implementation.
int sepFilterInit(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, uchar* kernelx_data, int kernelx_length, uchar* kernely_data, int kernely_length, int anchor_x, int anchor_y, double delta, int borderType)
{
    // Only float kernels of length 3 or 5 with equal x/y lengths are handled.
    const bool kernel_ok = (kernel_type == CV_32FC1)
                        && (kernelx_length == 3 || kernelx_length == 5)
                        && (kernelx_length == kernely_length);
    const bool src_ok    = (src_type == CV_8UC1 || src_type == CV_16SC1 || src_type == CV_32FC1);
    const bool dst_ok    = (dst_type == CV_16SC1 || dst_type == CV_32FC1);
    const bool border_ok = ((borderType & ~BORDER_ISOLATED) != BORDER_WRAP);
    if (!kernel_ok || !src_ok || !dst_ok || !border_ok)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // A negative anchor means "use the kernel center".
    if (anchor_x < 0)
        anchor_x = kernelx_length / 2;
    if (anchor_y < 0)
        anchor_y = kernely_length / 2;

    // The border type is stored with the BORDER_ISOLATED bit cleared.
    // NOTE(review): the worker still tests that bit, so its isolated branch
    // appears unreachable from this path — confirm intended.
    *context = reinterpret_cast<cvhalFilter2D*>(new sepFilter2D{src_type, dst_type, kernel_type, kernelx_data, kernelx_length, kernely_data, kernely_length, anchor_x, anchor_y, delta, borderType & ~BORDER_ISOLATED});
    return CV_HAL_ERROR_OK;
}
// Runs the separable filter described by `context` over the image strip.
// In-place calls (src_data == dst_data) are handled by filtering into a
// scratch buffer and copying the result back row by row on success.
int sepFilter(cvhalFilter2D *context, uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int full_width, int full_height, int offset_x, int offset_y)
{
    sepFilter2D* data = reinterpret_cast<sepFilter2D*>(context);

    // Remember the caller's destination before possibly redirecting it.
    uchar* const caller_dst = dst_data;
    const size_t caller_step = dst_step;
    const size_t elem_size = CV_ELEM_SIZE(data->dst_type);
    const bool in_place = (src_data == caller_dst);
    std::vector<uchar> scratch;
    if (in_place)
    {
        // Tightly packed scratch destination: step == width * elem_size.
        scratch.resize(width * height * elem_size);
        dst_data = scratch.data();
        dst_step = width * elem_size;
    }

    // Dispatch on (kernel length, element type); unsupported combinations
    // leave res as NOT_IMPLEMENTED.
    int res = CV_HAL_ERROR_NOT_IMPLEMENTED;
    if (data->kernelx_length == 3)
    {
        if (data->src_type == CV_8UC1)
            res = common::invoke(height, {sepFilter<3, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
        else if (data->src_type == CV_16SC1)
            res = common::invoke(height, {sepFilter<3, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
        else if (data->src_type == CV_32FC1)
            res = common::invoke(height, {sepFilter<3, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
    }
    else if (data->kernelx_length == 5)
    {
        if (data->src_type == CV_8UC1)
            res = common::invoke(height, {sepFilter<5, uchar>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
        else if (data->src_type == CV_16SC1)
            res = common::invoke(height, {sepFilter<5, short>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
        else if (data->src_type == CV_32FC1)
            res = common::invoke(height, {sepFilter<5, float>}, data, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y);
    }

    if (res == CV_HAL_ERROR_NOT_IMPLEMENTED)
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    if (in_place)
    {
        // Copy each filtered row (dst_step bytes == width * elem_size) back.
        for (int i = 0; i < height; i++)
            memcpy(caller_dst + i * caller_step, scratch.data() + i * dst_step, dst_step);
    }
    return res;
}
// Releases the sepFilter2D state allocated by sepFilterInit().
int sepFilterFree(cvhalFilter2D* context)
{
    sepFilter2D* state = reinterpret_cast<sepFilter2D*>(context);
    delete state;
    return CV_HAL_ERROR_OK;
}
#endif // CV_HAL_RVV_1P0_ENABLED
}}} // cv::rvv_hal::imgproc

View File

@ -4,18 +4,15 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_THRESH_HPP_INCLUDED
#define OPENCV_HAL_RVV_THRESH_HPP_INCLUDED
#include <riscv_vector.h>
#include "rvv_hal.hpp"
#include "common.hpp"
#include <atomic>
namespace cv { namespace cv_hal_rvv {
namespace cv { namespace rvv_hal { namespace imgproc {
namespace threshold {
// disabled since UI is fast enough, only called in threshold_otsu
// #undef cv_hal_threshold
// #define cv_hal_threshold cv::cv_hal_rvv::threshold::threshold
#if CV_HAL_RVV_1P0_ENABLED
namespace {
class ThresholdInvoker : public ParallelLoopBody
{
@ -182,16 +179,6 @@ static inline int threshold_range(int start, int end, const uchar* src_data, siz
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
inline int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType)
{
return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType);
}
} // cv::cv_hal_rvv::threshold
namespace threshold_otsu {
#undef cv_hal_threshold_otsu
#define cv_hal_threshold_otsu cv::cv_hal_rvv::threshold_otsu::threshold_otsu
static inline int otsu(int start, int end, const uchar* src_data, size_t src_step, int width, std::atomic<int>* cnt, int N, int* h)
{
const int c = cnt->fetch_add(1) % cv::getNumThreads();
@ -205,69 +192,6 @@ static inline int otsu(int start, int end, const uchar* src_data, size_t src_ste
return CV_HAL_ERROR_OK;
}
// the algorithm is copied from imgproc/src/thresh.cpp,
// in the function template static double getThreshVal_Otsu
inline int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh)
{
if (depth != CV_8UC1 || width * height < (1 << 15))
return CV_HAL_ERROR_NOT_IMPLEMENTED;
const int N = std::numeric_limits<uchar>::max() + 1;
const int nums = cv::getNumThreads();
std::vector<int> _h(N * nums, 0);
int* h = _h.data();
std::atomic<int> cnt(0);
cv::parallel_for_(Range(0, height), threshold::ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums);
for (int i = N; i < nums * N; i++)
{
h[i % N] += h[i];
}
double mu = 0, scale = 1. / (width*height);
for (int i = 0; i < N; i++)
{
mu += i*(double)h[i];
}
mu *= scale;
double mu1 = 0, q1 = 0;
double max_sigma = 0, max_val = 0;
for (int i = 0; i < N; i++)
{
double p_i, q2, mu2, sigma;
p_i = h[i]*scale;
mu1 *= q1;
q1 += p_i;
q2 = 1. - q1;
if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON)
continue;
mu1 = (mu1 + i*p_i)/q1;
mu2 = (mu - q1*mu1)/q2;
sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2);
if (sigma > max_sigma)
{
max_sigma = sigma;
max_val = i;
}
}
*thresh = max_val;
if (dst_data == nullptr)
return CV_HAL_ERROR_OK;
return threshold::invoke(width, height, {threshold::threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType);
}
} // cv::cv_hal_rvv::threshold_otsu
namespace adaptiveThreshold {
#undef cv_hal_adaptiveThreshold
#define cv_hal_adaptiveThreshold cv::cv_hal_rvv::adaptiveThreshold::adaptiveThreshold
// the algorithm is copied from imgproc/src/thresh.cpp,
// in the function void cv::adaptiveThreshold
template<int ksize, int method, int type>
@ -444,7 +368,72 @@ static inline int adaptiveThreshold(int start, int end, const uchar* src_data, s
return CV_HAL_ERROR_OK;
}
inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C)
} // anonymous
// HAL entry point: thresholds the whole image by delegating to
// threshold_range() over the full row range [0, height).
int threshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, int cn, double thresh, double maxValue, int thresholdType)
{
    return threshold_range(0, height, src_data, src_step, dst_data, dst_step, width, depth, cn, thresh, maxValue, thresholdType);
}
// the algorithm is copied from imgproc/src/thresh.cpp,
// in the function template static double getThreshVal_Otsu
//
// Computes the Otsu threshold of an 8-bit single-channel image (histogram
// built in parallel, one partial histogram per thread) and writes it to
// *thresh. If dst_data is non-null the image is additionally thresholded
// with the computed value. Small images (< 2^15 pixels) are declined so the
// scalar fallback handles them.
int threshold_otsu(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, int depth, double maxValue, int thresholdType, double* thresh)
{
    if (depth != CV_8UC1 || width * height < (1 << 15))
        return CV_HAL_ERROR_NOT_IMPLEMENTED;

    // One 256-bin histogram per worker thread, laid out contiguously.
    const int N = std::numeric_limits<uchar>::max() + 1;
    const int nums = cv::getNumThreads();
    std::vector<int> _h(N * nums, 0);
    int* h = _h.data();
    // Atomic counter hands each worker its own histogram slot (see otsu()).
    std::atomic<int> cnt(0);
    cv::parallel_for_(Range(0, height), ThresholdInvoker({otsu}, src_data, src_step, width, &cnt, N, h), nums);
    // Reduce the per-thread histograms into the first N bins.
    for (int i = N; i < nums * N; i++)
    {
        h[i % N] += h[i];
    }

    // Otsu's method: pick the threshold maximizing between-class variance.
    double mu = 0, scale = 1. / (width*height);
    for (int i = 0; i < N; i++)
    {
        mu += i*(double)h[i];
    }
    mu *= scale;
    double mu1 = 0, q1 = 0;
    double max_sigma = 0, max_val = 0;
    for (int i = 0; i < N; i++)
    {
        double p_i, q2, mu2, sigma;
        p_i = h[i]*scale;
        mu1 *= q1;
        q1 += p_i;
        q2 = 1. - q1;
        // Skip degenerate splits where one class is (almost) empty.
        if (std::min(q1,q2) < FLT_EPSILON || std::max(q1,q2) > 1. - FLT_EPSILON)
            continue;
        mu1 = (mu1 + i*p_i)/q1;
        mu2 = (mu - q1*mu1)/q2;
        sigma = q1*q2*(mu1 - mu2)*(mu1 - mu2);
        if (sigma > max_sigma)
        {
            max_sigma = sigma;
            max_val = i;
        }
    }
    *thresh = max_val;
    // Histogram-only mode: caller just wanted the threshold value.
    if (dst_data == nullptr)
        return CV_HAL_ERROR_OK;
    return invoke(width, height, {threshold_range}, src_data, src_step, dst_data, dst_step, width, depth, 1, max_val, maxValue, thresholdType);
}
int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, int width, int height, double maxValue, int adaptiveMethod, int thresholdType, int blockSize, double C)
{
if (thresholdType != CV_HAL_THRESH_BINARY && thresholdType != CV_HAL_THRESH_BINARY_INV)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -456,27 +445,26 @@ inline int adaptiveThreshold(const uchar* src_data, size_t src_step, uchar* dst_
switch (blockSize*100 + adaptiveMethod*10 + thresholdType)
{
case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY:
return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 300 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV:
return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY:
return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 500 + CV_HAL_ADAPTIVE_THRESH_MEAN_C*10 + CV_HAL_THRESH_BINARY_INV:
return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_MEAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY:
return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 300 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV:
return threshold::invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<3, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY:
return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
case 500 + CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C*10 + CV_HAL_THRESH_BINARY_INV:
return threshold::invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
return invoke(width, height, {adaptiveThreshold<5, CV_HAL_ADAPTIVE_THRESH_GAUSSIAN_C, CV_HAL_THRESH_BINARY_INV>}, src_data, src_step, dst_data, dst_step, width, height, maxValue, C);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::adaptiveThreshold
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -4,22 +4,14 @@
// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED
#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED
#include "rvv_hal.hpp"
#include "common.hpp"
#include <riscv_vector.h>
namespace cv { namespace rvv_hal { namespace imgproc {
namespace cv { namespace cv_hal_rvv {
#if CV_HAL_RVV_1P0_ENABLED
namespace remap {
// BUG: https://github.com/opencv/opencv/issues/27279
// #undef cv_hal_remap32f
// #define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f
// #undef cv_hal_remap32fc2
// #define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2
// #undef cv_hal_remap16s
// #define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s
namespace {
class RemapInvoker : public ParallelLoopBody
{
@ -794,6 +786,8 @@ static inline int remap32fC4(int start, int end, const uchar *src_data, size_t s
return CV_HAL_ERROR_OK;
}
} // anonymous
// the algorithm is copied from 3rdparty/carotene/src/remap.cpp,
// in the function void CAROTENE_NS::remapNearestNeighbor and void CAROTENE_NS::remapLinear
template<bool s16 = false>
@ -880,17 +874,6 @@ inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int sr
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return remap32f<true>(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast<float*>(mapx), mapx_step, reinterpret_cast<float*>(mapy), mapy_step, interpolation, border_type, border_value);
}
} // cv::cv_hal_rvv::remap
namespace warp {
// BUG: https://github.com/opencv/opencv/issues/27280
//#undef cv_hal_warpAffine
//#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine
// BUG: https://github.com/opencv/opencv/issues/27281
//#undef cv_hal_warpPerspective
//#define cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective
template<bool perspective>
static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue)
@ -1162,7 +1145,7 @@ static inline int warpC4(int start, int end, const uchar *src_data, size_t src_s
// the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp,
// in the function void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear
inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
{
if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -1174,11 +1157,11 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int
switch (src_type)
{
case CV_8UC1:
return remap::invoke(dst_width, dst_height, {warpC1<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC1<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC3:
return remap::invoke(dst_width, dst_height, {warpC3<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC3<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC4:
return remap::invoke(dst_width, dst_height, {warpC4<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC4<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -1186,7 +1169,7 @@ inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int
// the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp,
// in the function void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear
inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
{
if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
@ -1198,17 +1181,16 @@ inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step,
switch (src_type)
{
case CV_8UC1:
return remap::invoke(dst_width, dst_height, {warpC1<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC1<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC3:
return remap::invoke(dst_width, dst_height, {warpC3<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC3<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
case CV_8UC4:
return remap::invoke(dst_width, dst_height, {warpC4<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
return invoke(dst_width, dst_height, {warpC4<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
}
return CV_HAL_ERROR_NOT_IMPLEMENTED;
}
} // cv::cv_hal_rvv::warp
}}
#endif // CV_HAL_RVV_1P0_ENABLED
#endif
}}} // cv::rvv_hal::imgproc

View File

@ -1,109 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_071_HPP_INCLUDED
#define OPENCV_HAL_RVV_071_HPP_INCLUDED
#include <riscv_vector.h>
#include <limits>
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_cvtBGRtoBGR
#define cv_hal_cvtBGRtoBGR cv::cv_hal_rvv::cvtBGRtoBGR
// Byte-permutation tables for vrgather in vBGRtoBGR(): within each pixel the
// first and third channel indices are swapped (B<->R).
// index_array_32 covers 8 four-channel pixels (32 bytes, alpha untouched);
// index_array_24 covers 8 three-channel pixels (24 bytes).
static const unsigned char index_array_32 [32]
{ 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, 18, 17, 16, 19, 22, 21, 20, 23, 26, 25, 24, 27, 30, 29, 28, 31 };
static const unsigned char index_array_24 [24]
{ 2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17, 16, 15, 20, 19, 18, 23, 22, 21 };
// Vectorized channel swap for one image row using RVV 0.7.1-style intrinsics
// (no __riscv_ prefix): loads `vsize` bytes, permutes them through `index`
// (see index_array_24/32), stores them, then finishes the remaining pixels
// with a scalar loop.
//   n            - pixel count of the row
//   scn/dcn      - source/destination channel counts
//   vsize_pixels - pixels consumed per vector iteration
//   vsize        - bytes per vector iteration (vsize_pixels * channels)
// NOTE(review): the loop bound `i <= n - vsize` compares a pixel index with a
// byte count, so the vector loop stops earlier than strictly necessary and
// the scalar tail picks up the rest — conservative but worth confirming.
static void vBGRtoBGR(const unsigned char* src, unsigned char * dst, const unsigned char * index, int n, int scn, int dcn, int vsize_pixels, const int vsize)
{
    vuint8m2_t vec_index = vle8_v_u8m2(index, vsize);
    int i = 0;
    // Main loop: permute a whole group of pixels per iteration.
    for ( ; i <= n-vsize; i += vsize_pixels, src += vsize, dst += vsize)
    {
        vuint8m2_t vec_src = vle8_v_u8m2(src, vsize);
        vuint8m2_t vec_dst = vrgather_vv_u8m2(vec_src, vec_index, vsize);
        vse8_v_u8m2(dst, vec_dst, vsize);
    }
    // Scalar tail: swap channels 0 and 2, keep channel 1, copy alpha if present.
    for ( ; i < n; i++, src += scn, dst += dcn )
    {
        unsigned char t0 = src[0], t1 = src[1], t2 = src[2];
        dst[2] = t0;
        dst[1] = t1;
        dst[0] = t2;
        if(dcn == 4)
        {
            unsigned char d = src[3];
            dst[3] = d;
        }
    }
}
// Scalar per-pixel channel copy/swap for one row of n pixels.
// bi selects where channel 0 lands: bi == 0 keeps the order, bi == 2 swaps
// the first and third channels (B<->R); channel 1 is copied unchanged.
// When the destination has 4 channels, alpha is taken from the source if it
// has one, otherwise filled with the maximum (opaque) value.
static void sBGRtoBGR(const unsigned char* src, unsigned char * dst, int n, int scn, int dcn, int bi)
{
    const unsigned char opaque = std::numeric_limits<unsigned char>::max();
    while (n-- > 0)
    {
        const unsigned char c0 = src[0];
        const unsigned char c1 = src[1];
        const unsigned char c2 = src[2];
        dst[bi] = c0;
        dst[1] = c1;
        dst[bi ^ 2] = c2;
        if (dcn == 4)
            dst[3] = (scn == 4) ? src[3] : opaque;
        src += scn;
        dst += dcn;
    }
}
// HAL color conversion between BGR-family layouts for 8-bit images.
// Same channel count + swap  -> vectorized per-row permutation (vBGRtoBGR);
// different channel counts    -> scalar per-pixel conversion (sBGRtoBGR).
// Declines (NOT_IMPLEMENTED) non-8-bit depths and the same-count/no-swap
// case, which OpenCV's generic code handles.
static int cvtBGRtoBGR(const unsigned char * src_data, size_t src_step, unsigned char * dst_data, size_t dst_step, int width, int height, int depth, int scn, int dcn, bool swapBlue)
{
    if (depth != CV_8U)
    {
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    }
    // Destination index of the source's channel 0 (0 = keep order, 2 = swap B/R).
    const int blueIdx = swapBlue ? 2 : 0;
    if (scn == dcn)
    {
        if (!swapBlue)
        {
            // Same layout and no swap would be a plain copy; let OpenCV do it.
            return CV_HAL_ERROR_NOT_IMPLEMENTED;
        }
        // 8 pixels per vector iteration; byte count depends on channel count.
        const int vsize_pixels = 8;
        if (scn == 4)
        {
            for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
            {
                vBGRtoBGR(src_data, dst_data, index_array_32, width, scn, dcn, vsize_pixels, 32);
            }
        }
        else
        {
            for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
            {
                vBGRtoBGR(src_data, dst_data, index_array_24, width, scn, dcn, vsize_pixels, 24);
            }
        }
    }
    else
    {
        // Channel count changes (3<->4): scalar path handles add/drop of alpha.
        for (int i = 0; i < height; i++, src_data += src_step, dst_data += dst_step)
            sBGRtoBGR(src_data, dst_data, width, scn, dcn, blueIdx);
    }
    return CV_HAL_ERROR_OK;
}
}}
#endif

View File

@ -99,7 +99,8 @@ void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray d
CV_CheckType(type, cn == 1 || cn == 3 || cn == 4,
"Number of channels must be 1, 3 or 4" );
CV_Assert(image.getMat().total() > 0);
cv::Mat img = image.getMat();
CV_Assert(img.total() > 0);
CV_Assert(length > 0);
// project axes points
@ -111,6 +112,18 @@ void drawFrameAxes(InputOutputArray image, InputArray cameraMatrix, InputArray d
std::vector<Point2f> imagePoints;
projectPoints(axesPoints, rvec, tvec, cameraMatrix, distCoeffs, imagePoints);
cv::Rect imageRect(0, 0, img.cols, img.rows);
bool allIn = true;
for (size_t i = 0; i < imagePoints.size(); i++)
{
allIn &= imageRect.contains(imagePoints[i]);
}
if (!allIn)
{
CV_LOG_WARNING(NULL, "Some of projected axes endpoints are out of frame. The drawn axes may not be reliable.");
}
// draw axes lines
line(image, imagePoints[0], imagePoints[1], Scalar(0, 0, 255), thickness);
line(image, imagePoints[0], imagePoints[2], Scalar(0, 255, 0), thickness);

View File

@ -17,7 +17,7 @@ class Error : public Algorithm {
public:
// set model to use getError() function
virtual void setModelParameters (const Mat &model) = 0;
// returns error of point wih @point_idx w.r.t. model
// returns error of point with @point_idx w.r.t. model
virtual float getError (int point_idx) const = 0;
virtual const std::vector<float> &getErrors (const Mat &model) = 0;
};

View File

@ -175,7 +175,7 @@ TEST_F(fisheyeTest, CalibrationWithFixedFocalLength)
cv::fisheye::calibrate(objectPoints, imagePoints, imageSize, theK, theD,
cv::noArray(), cv::noArray(), flag, cv::TermCriteria(3, 20, 1e-6));
// ensure that CALIB_FIX_FOCAL_LENGTH works and focal lenght has not changed
// ensure that CALIB_FIX_FOCAL_LENGTH works and focal length has not changed
EXPECT_EQ(theK(0,0), K(0,0));
EXPECT_EQ(theK(1,1), K(1,1));

View File

@ -1965,8 +1965,8 @@ The function solveCubic finds the real roots of a cubic equation:
The roots are stored in the roots array.
@param coeffs equation coefficients, an array of 3 or 4 elements.
@param roots output array of real roots that has 1 or 3 elements.
@return number of real roots. It can be 0, 1 or 2.
@param roots output array of real roots that has 0, 1, 2 or 3 elements.
@return number of real roots. It can be -1 (all real numbers), 0, 1, 2 or 3.
*/
CV_EXPORTS_W int solveCubic(InputArray coeffs, OutputArray roots);

View File

@ -225,32 +225,30 @@ These operations allow to reorder or recombine elements in one or multiple vecto
Element-wise binary and unary operations.
- Arithmetics:
@ref v_add(const v_reg &a, const v_reg &b) "+",
@ref v_sub(const v_reg &a, const v_reg &b) "-",
@ref v_mul(const v_reg &a, const v_reg &b) "*",
@ref v_div(const v_reg &a, const v_reg &b) "/",
@ref v_add,
@ref v_sub,
@ref v_mul,
@ref v_div,
@ref v_mul_expand
- Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
- Bitwise shifts:
@ref v_shl(const v_reg &a, int s) "<<",
@ref v_shr(const v_reg &a, int s) ">>",
@ref v_shl, @ref v_shr
- Bitwise logic:
@ref v_and(const v_reg &a, const v_reg &b) "&",
@ref v_or(const v_reg &a, const v_reg &b) "|",
@ref v_xor(const v_reg &a, const v_reg &b) "^",
@ref v_not(const v_reg &a) "~"
@ref v_and,
@ref v_or,
@ref v_xor,
@ref v_not
- Comparison:
@ref v_gt(const v_reg &a, const v_reg &b) ">",
@ref v_ge(const v_reg &a, const v_reg &b) ">=",
@ref v_lt(const v_reg &a, const v_reg &b) "<",
@ref v_le(const v_reg &a, const v_reg &b) "<=",
@ref v_eq(const v_reg &a, const v_reg &b) "==",
@ref v_ne(const v_reg &a, const v_reg &b) "!="
@ref v_gt,
@ref v_ge,
@ref v_lt,
@ref v_le,
@ref v_eq,
@ref v_ne
- min/max: @ref v_min, @ref v_max

View File

@ -0,0 +1,111 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This file has been created for compatibility with older versions of Universal Intrinsics
// Binary operators for vector types have been removed since version 4.11
// Include this file manually after OpenCV headers if you need these operators
#ifndef OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#define OPENCV_HAL_INTRIN_LEGACY_OPS_HPP
#ifdef __OPENCV_BUILD
#error "Universal Intrinsics operators are deprecated and should not be used in OpenCV library"
#endif
#ifdef __riscv
#warning "Operators might conflict with built-in functions on RISC-V platform"
#endif
#if defined(CV_VERSION) && CV_VERSION_MAJOR == 4 && CV_VERSION_MINOR < 9
#warning "Older versions of OpenCV (<4.9) already have Universal Intrinscs operators"
#endif
namespace cv { namespace hal {
#define BIN_OP(OP, FUN) \
template <typename R> R operator OP (const R & lhs, const R & rhs) { return FUN(lhs, rhs); }
#define BIN_A_OP(OP, FUN) \
template <typename R> R & operator OP (R & res, const R & val) { res = FUN(res, val); return res; }
#define UN_OP(OP, FUN) \
template <typename R> R operator OP (const R & val) { return FUN(val); }
BIN_OP(+, v_add)
BIN_OP(-, v_sub)
BIN_OP(*, v_mul)
BIN_OP(/, v_div)
BIN_OP(&, v_and)
BIN_OP(|, v_or)
BIN_OP(^, v_xor)
BIN_OP(==, v_eq)
BIN_OP(!=, v_ne)
BIN_OP(<, v_lt)
BIN_OP(>, v_gt)
BIN_OP(<=, v_le)
BIN_OP(>=, v_ge)
BIN_A_OP(+=, v_add)
BIN_A_OP(-=, v_sub)
BIN_A_OP(*=, v_mul)
BIN_A_OP(/=, v_div)
BIN_A_OP(&=, v_and)
BIN_A_OP(|=, v_or)
BIN_A_OP(^=, v_xor)
UN_OP(~, v_not)
// TODO: shift operators?
}} // cv::hal::
//==============================================================================
#ifdef OPENCV_ENABLE_INLINE_INTRIN_OPERATOR_TEST
namespace cv { namespace hal {
inline static void opencv_operator_compile_test()
{
using namespace cv;
v_float32 a, b, c;
uint8_t shift = 1;
a = b + c;
a = b - c;
a = b * c;
a = b / c;
a = b & c;
a = b | c;
a = b ^ c;
// a = b >> shift;
// a = b << shift;
a = (b == c);
a = (b != c);
a = (b < c);}}
a = (b > c);
a = (b <= c);
a = (b >= c);
a += b;
a -= b;
a *= b;
a /= b;
a &= b;
a |= b;
a ^= b;
// a <<= shift;
// a >>= shift;
a = ~b;
}
}} // cv::hal::
#endif
#endif // OPENCV_HAL_INTRIN_LEGACY_OPS_HPP

View File

@ -3184,6 +3184,12 @@ Mat_<_Tp>& Mat_<_Tp>::operator = (const MatExpr& e)
return *this;
}
template<typename _Tp> inline
MatExpr Mat_<_Tp>::zeros(int _ndims, const int* _sizes)
{
return Mat::zeros(_ndims, _sizes, traits::Type<_Tp>::value);
}
template<typename _Tp> inline
MatExpr Mat_<_Tp>::zeros(int rows, int cols)
{

View File

@ -147,7 +147,23 @@ namespace cv { namespace cuda
inline explicit NppStreamHandler(cudaStream_t newStream)
{
nppStreamContext = {};
#if CUDA_VERSION < 12090
nppSafeCall(nppGetStreamContext(&nppStreamContext));
#else
int device = 0;
cudaSafeCall(cudaGetDevice(&device));
cudaDeviceProp prop{};
cudaSafeCall(cudaGetDeviceProperties(&prop, device));
nppStreamContext.nCudaDeviceId = device;
nppStreamContext.nMultiProcessorCount = prop.multiProcessorCount;
nppStreamContext.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
nppStreamContext.nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
nppStreamContext.nSharedMemPerBlock = prop.sharedMemPerBlock;
nppStreamContext.nCudaDevAttrComputeCapabilityMajor = prop.major;
nppStreamContext.nCudaDevAttrComputeCapabilityMinor = prop.minor;
#endif
nppStreamContext.hStream = newStream;
cudaSafeCall(cudaStreamGetFlags(nppStreamContext.hStream, &nppStreamContext.nStreamFlags));
}

View File

@ -694,7 +694,7 @@ OCL_PERF_TEST_P(PowFixture, Pow, ::testing::Combine(
///////////// iPow ////////////////////////
OCL_PERF_TEST_P(PowFixture, iPow, ::testing::Combine(
OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8SC1,CV_16UC1,CV_16SC1,CV_32SC1)))
OCL_TEST_SIZES, OCL_PERF_ENUM(CV_8UC1, CV_8UC3, CV_8SC1, CV_16UC1, CV_16SC1, CV_32SC1, CV_32FC1, CV_64FC1)))
{
const Size_MatType_t params = GetParam();
const Size srcSize = get<0>(params);
@ -706,7 +706,7 @@ OCL_PERF_TEST_P(PowFixture, iPow, ::testing::Combine(
randu(src, 0, 100);
declare.in(src).out(dst);
OCL_TEST_CYCLE() cv::pow(src, 7.0, dst);
OCL_TEST_CYCLE() cv::pow(src, 3, dst);
SANITY_CHECK_NOTHING();
}

View File

@ -1223,8 +1223,22 @@ inline int hal_ni_copyToMasked(const uchar* src_data, size_t src_step, uchar* ds
#define cv_hal_copyToMasked hal_ni_copyToMasked
//! @endcond
//! @}
/**
@brief Computes the sum of image elements
@param src_data Source image data
@param src_step Source image step
@param src_type Source image type
@param width, height Source image dimensions
@param result Pointer to save the sum result to.
*/
inline int hal_ni_sum(const uchar *src_data, size_t src_step, int src_type, int width, int height, double *result)
{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @cond IGNORED
#define cv_hal_sum hal_ni_sum
//! @endcond
//! @}
#if defined(__clang__)
#pragma clang diagnostic pop

View File

@ -938,9 +938,40 @@ static bool ocl_pow(InputArray _src, double power, OutputArray _dst,
bool issqrt = std::abs(power - 0.5) < DBL_EPSILON;
const char * const op = issqrt ? "OP_SQRT" : is_ipower ? "OP_POWN" : "OP_POW";
// Note: channels are unrolled
std::string extra_opts ="";
if (is_ipower)
{
int wdepth = CV_32F;
if (depth == CV_64F)
wdepth = CV_64F;
else if (depth == CV_16F)
wdepth = CV_16F;
char cvt[2][50];
extra_opts = format(
" -D srcT1=%s -DsrcT1_C1=%s"
" -D srcT2=int -D workST=int"
" -D workT=%s -D wdepth=%d -D convertToWT1=%s"
" -D convertToDT=%s"
" -D workT1=%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, 1)),
wdepth,
ocl::convertTypeStr(depth, wdepth, 1, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(wdepth, depth, 1, cvt[1], sizeof(cvt[1])),
ocl::typeToStr(wdepth)
);
}
ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
format("-D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D %s -D UNARY_OP%s",
ocl::typeToStr(depth), depth, rowsPerWI, op,
format("-D cn=%d -D dstT=%s -D dstT_C1=%s -D DEPTH_dst=%d -D rowsPerWI=%d -D %s%s%s%s",
1,
ocl::typeToStr(depth), ocl::typeToStr(depth), depth, rowsPerWI, op,
" -D UNARY_OP=1",
extra_opts.empty() ? "" : extra_opts.c_str(),
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
if (k.empty())
return false;
@ -1396,7 +1427,7 @@ int cv::solveCubic( InputArray _coeffs, OutputArray _roots )
{
if( a1 == 0 )
{
if( a2 == 0 )
if( a2 == 0 ) // constant
n = a3 == 0 ? -1 : 0;
else
{
@ -1430,15 +1461,23 @@ int cv::solveCubic( InputArray _coeffs, OutputArray _roots )
}
else
{
// cubic equation
a0 = 1./a0;
a1 *= a0;
a2 *= a0;
a3 *= a0;
double Q = (a1 * a1 - 3 * a2) * (1./9);
double R = (2 * a1 * a1 * a1 - 9 * a1 * a2 + 27 * a3) * (1./54);
double R = (a1 * (2 * a1 * a1 - 9 * a2) + 27 * a3) * (1./54);
double Qcubed = Q * Q * Q;
double d = Qcubed - R * R;
/*
Here we expand expression `Qcubed - R * R` for `d` variable
to reduce common terms `a1^6 / 729` and `-a1^4 * a2 / 81`
and thus decrease rounding error (in case of quite big coefficients).
And then we additionally group terms to further reduce rounding error.
*/
double d = (a1 * a1 * (a2 * a2 - 4 * a1 * a3) + 2 * a2 * (9 * a1 * a3 - 2 * a2 * a2) - 27 * a3 * a3) * (1./108);
if( d > 0 )
{

View File

@ -559,7 +559,7 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
((normType == NORM_HAMMING || normType == NORM_HAMMING2) && src1.type() == CV_8U) );
NormDiffFunc func = getNormDiffFunc(normType >> 1, depth);
CV_Assert( func != 0 );
CV_Assert( (normType >> 1) >= 3 || func != 0 );
if( src1.isContinuous() && src2.isContinuous() && mask.empty() )
{

View File

@ -1581,6 +1581,7 @@ NormDiffFunc getNormDiffFunc(int normType, int depth)
0
},
};
if (normType >= 3 || normType < 0) return nullptr;
return normDiffTab[normType][depth];
}

View File

@ -80,6 +80,10 @@
#error "Kernel configuration error: ambiguous 'depth' value is defined, use 'DEPTH_dst' instead"
#endif
#define CAT__(x, y) x ## y
#define CAT_(x, y) CAT__(x, y)
#define CAT(x, y) CAT_(x, y)
#if DEPTH_dst < 5 /* CV_32F */
#define CV_DST_TYPE_IS_INTEGER
@ -325,9 +329,12 @@
#define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
#elif defined OP_POWN
#undef workT
#define workT int
#define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
#if cn > 1
#define PROCESS_INIT CAT(int, cn) powi = (CAT(int, cn))srcelem2;
#else // cn
#define PROCESS_INIT int powi = srcelem2;
#endif
#define PROCESS_ELEM storedst(convertToDT(pown(srcelem1, powi)))
#elif defined OP_SQRT
#if CV_DST_TYPE_FIT_32F
@ -469,7 +476,7 @@
#define srcelem2 srcelem2_
#endif
#if cn == 3
#if !defined(PROCESS_INIT) && cn == 3
#undef srcelem2
#define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)
#endif
@ -517,6 +524,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;
#ifdef PROCESS_INIT
PROCESS_INIT
#endif
if (x < cols)
{
int mask_index = mad24(y0, maskstep, x + maskoffset);
@ -542,6 +553,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;
#ifdef PROCESS_INIT
PROCESS_INIT
#endif
if (x < cols)
{
int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
@ -564,6 +579,10 @@ __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
int x = get_global_id(0);
int y0 = get_global_id(1) * rowsPerWI;
#ifdef PROCESS_INIT
PROCESS_INIT
#endif
if (x < cols)
{
int mask_index = mad24(y0, maskstep, x + maskoffset);

View File

@ -10,14 +10,6 @@
#include "sum.simd.hpp"
#include "sum.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
#ifndef OPENCV_IPP_SUM
#undef HAVE_IPP
#undef CV_IPP_RUN_FAST
#define CV_IPP_RUN_FAST(f, ...)
#undef CV_IPP_RUN
#define CV_IPP_RUN(c, f, ...)
#endif // OPENCV_IPP_SUM
namespace cv
{
@ -126,95 +118,45 @@ bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask,
#endif
#ifdef HAVE_IPP
static bool ipp_sum(Mat &src, Scalar &_res)
{
CV_INSTRUMENT_REGION_IPP();
#if IPP_VERSION_X100 >= 700
int cn = src.channels();
if (cn > 4)
return false;
size_t total_size = src.total();
int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
if( src.dims <= 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
{
IppiSize sz = { cols, rows };
int type = src.type();
typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
ippiSumFuncHint ippiSumHint =
type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
0;
ippiSumFuncNoHint ippiSum =
type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
0;
CV_Assert(!ippiSumHint || !ippiSum);
if( ippiSumHint || ippiSum )
{
Ipp64f res[4];
IppStatus ret = ippiSumHint ?
CV_INSTRUMENT_FUN_IPP(ippiSumHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
CV_INSTRUMENT_FUN_IPP(ippiSum, src.ptr(), (int)src.step[0], sz, res);
if( ret >= 0 )
{
for( int i = 0; i < cn; i++ )
_res[i] = res[i];
return true;
}
}
}
#else
CV_UNUSED(src); CV_UNUSED(_res);
#endif
return false;
}
#endif
Scalar sum(InputArray _src)
{
CV_INSTRUMENT_REGION();
#if defined HAVE_OPENCL || defined HAVE_IPP
Scalar _res;
#endif
#ifdef HAVE_OPENCL
CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
ocl_sum(_src, _res, OCL_OP_SUM),
_res)
_res);
#endif
Mat src = _src.getMat();
CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res);
int cn = src.channels();
CV_CheckLE( cn, 4, "cv::sum does not support more than 4 channels" );
int k, cn = src.channels(), depth = src.depth();
if (_src.dims() <= 2)
{
CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, src.step, src.type(), src.cols, src.rows, &_res[0]);
}
else if (_src.isContinuous())
{
CALL_HAL_RET2(sum, cv_hal_sum, _res, src.data, 0, src.type(), (int)src.total(), 1, &_res[0]);
}
int k, depth = src.depth();
SumFunc func = getSumFunc(depth);
if (func == nullptr) {
if (depth == CV_Bool && cn == 1)
return Scalar((double)countNonZero(src));
CV_Error(Error::StsNotImplemented, "");
}
CV_Assert( cn <= 4 && func != 0 );
const Mat* arrays[] = {&src, 0};
uchar* ptrs[1] = {};
NAryMatIterator it(arrays, ptrs);
Scalar s;
int total = (int)it.size, blockSize = total, partialBlockSize = 0;
int j, count = 0;
int _buf[CV_CN_MAX];
int* buf = (int*)&s[0];
int* buf = (int*)&_res[0];
size_t esz = 0;
bool partialSumIsInt = depth < CV_32S;
bool blockSum = partialSumIsInt || depth == CV_16F || depth == CV_16BF;
@ -241,13 +183,13 @@ Scalar sum(InputArray _src)
if (partialSumIsInt) {
for( k = 0; k < cn; k++ )
{
s[k] += buf[k];
_res[k] += buf[k];
buf[k] = 0;
}
} else {
for( k = 0; k < cn; k++ )
{
s[k] += ((float*)buf)[k];
_res[k] += ((float*)buf)[k];
buf[k] = 0;
}
}
@ -256,7 +198,7 @@ Scalar sum(InputArray _src)
ptrs[0] += bsz*esz;
}
}
return s;
return _res;
}
} // namespace

View File

@ -132,19 +132,25 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
use_roi = GET_PARAM(2);
}
void generateTestData(bool with_val_in_range = false)
void generateTestData(bool with_val_in_range = false,
double minVal1 = std::numeric_limits<double>::quiet_NaN(), double maxVal1 = std::numeric_limits<double>::quiet_NaN(),
double minVal2 = std::numeric_limits<double>::quiet_NaN(), double maxVal2 = std::numeric_limits<double>::quiet_NaN()
)
{
const int type = CV_MAKE_TYPE(depth, cn);
double minV = cvtest::getMinVal(type);
double maxV = cvtest::getMaxVal(type);
double minV1 = cvIsNaN(minVal1) ? 2 : minVal1;
double maxV1 = cvIsNaN(maxVal1) ? 11 : maxVal1;
double minV2 = cvIsNaN(minVal2) ? std::max(-1540., cvtest::getMinVal(type)) : minVal2;
double maxV2 = cvIsNaN(maxVal2) ? std::min(1740., cvtest::getMaxVal(type)) : maxVal2;
Size roiSize = randomSize(1, MAX_VALUE);
Border src1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(src1, src1_roi, roiSize, src1Border, type, 2, 11); // FIXIT: Test with minV, maxV
randomSubMat(src1, src1_roi, roiSize, src1Border, type, minV1, maxV1); // FIXIT: Test with minV, maxV
Border src2Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(src2, src2_roi, roiSize, src2Border, type, std::max(-1540., minV), std::min(1740., maxV));
randomSubMat(src2, src2_roi, roiSize, src2Border, type, minV2, maxV2);
Border dst1Border = randomBorder(0, use_roi ? MAX_VALUE : 0);
randomSubMat(dst1, dst1_roi, roiSize, dst1Border, type, 5, 16);
@ -162,8 +168,8 @@ PARAM_TEST_CASE(ArithmTestBase, MatDepth, Channels, bool)
if (with_val_in_range)
{
val_in_range = cv::Scalar(rng.uniform(minV, maxV), rng.uniform(minV, maxV),
rng.uniform(minV, maxV), rng.uniform(minV, maxV));
val_in_range = cv::Scalar(rng.uniform(minV1, maxV1), rng.uniform(minV1, maxV1),
rng.uniform(minV1, maxV1), rng.uniform(minV1, maxV1));
}
UMAT_UPLOAD_INPUT_PARAMETER(src1);
@ -844,14 +850,30 @@ OCL_TEST_P(Pow, Mat)
for (int j = 0; j < 1/*test_loop_times*/; j++)
for (int k = 0, size = sizeof(pows) / sizeof(double); k < size; ++k)
{
SCOPED_TRACE(pows[k]);
SCOPED_TRACE(cv::format("POW=%g", pows[k]));
generateTestData();
generateTestData(false, 1, 3);
OCL_OFF(cv::pow(src1_roi, pows[k], dst1_roi));
OCL_ON(cv::pow(usrc1_roi, pows[k], udst1_roi));
OCL_EXPECT_MATS_NEAR_RELATIVE(dst1, 1e-5);
if (cvtest::debugLevel >= 100)
{
cv::Rect roi(0, 0, 4, 4);
std::cout << src1_roi(roi) << std::endl;
std::cout << dst1_roi(roi) << std::endl;
std::cout << udst1_roi(roi) << std::endl;
Mat diff;
cv::absdiff(dst1_roi, udst1_roi, diff);
std::cout << std::endl << diff(roi) << std::endl;
std::cout << std::endl << dst1_roi << std::endl;
std::cout << std::endl << udst1_roi << std::endl;
std::cout << std::endl << diff << std::endl;
}
}
}

Some files were not shown because too many files have changed in this diff Show More