From fa58c1205b5a3149e9b75f74c796593debad319d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A4=A9=E9=9F=B3=E3=81=82=E3=82=81?=
Date: Tue, 25 Mar 2025 16:57:47 +0800
Subject: [PATCH] Merge pull request #27119 from amane-ame:warp_hal_rvv

Add RISC-V HAL implementation for cv::warp series #27119

This patch implements `cv_hal_remap`, `cv_hal_warpAffine`, and `cv_hal_warpPerspective` using native RVV intrinsics, optimizing the performance of `cv::remap/cv::warpAffine/cv::warpPerspective` for the `CV_HAL_INTER_NEAREST/CV_HAL_INTER_LINEAR/CV_HAL_INTER_CUBIC/CV_HAL_INTER_LANCZOS4` modes.

Tested on MUSE-PI (Spacemit X60) with both GCC 14.2 and Clang 20.0. A minimal usage sketch that drives these code paths through the public API is appended at the end of this patch.

```
$ ./opencv_test_imgproc --gtest_filter="*Remap*:*Warp*"
$ ./opencv_perf_imgproc --gtest_filter="*Remap*:*remap*:*Warp*" --perf_min_samples=200 --perf_force_samples=200
```

View the full perf table here: [hal_rvv_warp.pdf](https://github.com/user-attachments/files/19403718/hal_rvv_warp.pdf)

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 3rdparty/hal_rvv/hal_rvv.hpp            |    1 +
 3rdparty/hal_rvv/hal_rvv_1p0/types.hpp  |    6 +-
 3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp   | 1208 +++++++++++++++++++++++
 modules/imgproc/src/hal_replacement.hpp |   49 +
 modules/imgproc/src/imgwarp.cpp         |   10 +
 5 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp

diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp
index 8287ebeda9..4a10906a33 100644
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@@ -50,6 +50,7 @@
 #include "hal_rvv_1p0/filter.hpp" // imgproc
 #include "hal_rvv_1p0/pyramids.hpp" // imgproc
 #include "hal_rvv_1p0/color.hpp" // imgproc
+#include "hal_rvv_1p0/warp.hpp" // imgproc
 #include "hal_rvv_1p0/thresh.hpp" // imgproc
 #include "hal_rvv_1p0/histogram.hpp" // imgproc
 #endif
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
index 79db847eb5..8c8ad23787 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
@@ -91,10 +91,12 @@ using RVV_F64M2 = struct RVV;
 using RVV_F64M4 = struct RVV;
 using RVV_F64M8 = struct RVV;
 
-// Only for dst type lmul >= 1
 template
 using RVV_SameLen =
-    RVV;
+    RVV((RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.5 ? 9 : \
+        (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.25 ? 10 : \
+        (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ?
static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.125 ? 11 : \ + (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType)))>; template struct RVV_ToIntHelper; template struct RVV_ToUintHelper; diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp new file mode 100644 index 0000000000..d9fcf9c109 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp @@ -0,0 +1,1208 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED +#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { + +namespace remap { +#undef cv_hal_remap32f +#define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f +#undef cv_hal_remap32fc2 +#define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2 +#undef cv_hal_remap16s +#define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s + +class RemapInvoker : public ParallelLoopBody +{ +public: + template + RemapInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +template +static inline int invoke(int width, int height, std::function func, Args&&... 
args) +{ + cv::parallel_for_(Range(1, height), RemapInvoker(func, std::forward(args)...), static_cast((width - 1) * height) / (1 << 15)); + return func(0, 1, std::forward(args)...); +} + +template struct rvv; +// NN & LINEAR +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vuint8m2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8m2_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8m2_t vloxei(const uchar* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u8m2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vuint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vuint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16m4_t vloxei(const ushort* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u16m4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16m4_t vloxei(const short* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_i16m4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vloxei(const float* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_f32m8(a, b, c); } +}; +// CUBIC +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vuint8mf4_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8mf4_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8mf4_t vloxei(const uchar* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u8mf4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vuint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vuint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16mf2_t vloxei(const ushort* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u16mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16mf2_t vloxei(const short* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_i16mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vfloat32m1_t a, size_t) { return a; } + static inline vfloat32m1_t vcvt1(vfloat32m1_t a, size_t) { return a; } + static inline vfloat32m1_t vloxei(const float* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_f32m1(a, b, c); } +}; +// LANCZOS4 +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vuint8mf2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8mf2_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8mf2_t vloxei(const uchar* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u8mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vuint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + 
static inline vuint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16m1_t vloxei(const ushort* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u16m1(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16m1_t vloxei(const short* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_i16m1(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vfloat32m2_t a, size_t) { return a; } + static inline vfloat32m2_t vcvt1(vfloat32m2_t a, size_t) { return a; } + static inline vfloat32m2_t vloxei(const float* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_f32m2(a, b, c); } +}; + +template +static inline int remap32fC1(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + using T = typename helper::ElemType; + const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP; + + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = helper::setvl(dst_width - j); + typename RVV_SameLen::VecType mx, my; + if (s16) + { + auto map = __riscv_vlseg2e16_v_i16m4x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl); + mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 0), vl); + my = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 1), vl); + } + else + { + if (mapy == nullptr) + { + mx = RVV_SameLen::vload_stride(mapx + i * mapx_step + j * 2 , sizeof(float) * 2, vl); + my = RVV_SameLen::vload_stride(mapx + i * mapx_step + j * 2 + 1, sizeof(float) * 2, vl); + } + else + { + mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl); + my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl); + } + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) { + auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + if (mode == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, access(ix, iy), vl); + } + else if (mode == CV_HAL_INTER_LINEAR) + { + typename RVV_SameLen::VecType ix0, iy0; + if (s16) + { + ix0 = __riscv_vfcvt_x(mx, vl); + iy0 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16m4(reinterpret_cast(mapy) + i * mapy_step + j, vl); + mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, 
vl), vl), 32, vl);
+                }
+                else
+                {
+                    auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
+                    auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
+                    ix0 = __riscv_vsra(imx, 5, vl);
+                    iy0 = __riscv_vsra(imy, 5, vl);
+                    mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
+                    my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
+                }
+                auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
+                auto v0 = rvv::vcvt0(access(ix0, iy0), vl);
+                auto v1 = rvv::vcvt0(access(ix1, iy0), vl);
+                auto v2 = rvv::vcvt0(access(ix0, iy1), vl);
+                auto v3 = rvv::vcvt0(access(ix1, iy1), vl);
+
+                v0 = __riscv_vfmacc(v0, mx, __riscv_vfsub(v1, v0, vl), vl);
+                v2 = __riscv_vfmacc(v2, mx, __riscv_vfsub(v3, v2, vl), vl);
+                v0 = __riscv_vfmacc(v0, my, __riscv_vfsub(v2, v0, vl), vl);
+                helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(v0, vl), vl);
+            }
+            else
+            {
+                return CV_HAL_ERROR_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+class RemapTable
+{
+private:
+    RemapTable()
+    {
+        // the algorithm is copied from imgproc/src/imgwarp.cpp,
+        // in the function static void interpolateLanczos4
+        constexpr double s45 = 0.70710678118654752440084436210485;
+        constexpr double cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
+
+        for (int t = 0; t < 32; t++)
+        {
+            float x = t / 32.0f;
+            if (x < FLT_EPSILON)
+            {
+                for (int i = 0; i < 8; i++)
+                    coeffs[t*8+i] = 0;
+                coeffs[t*8+3] = 1;
+                continue;
+            }
+
+            float sum = 0;
+            double y0 = -(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0 = std::cos(y0);
+            for (int i = 0; i < 8; i++)
+            {
+                double y = -(x+3-i)*CV_PI*0.25;
+                coeffs[t*8+i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
+                sum += coeffs[t*8+i];
+            }
+
+            sum = 1.f/sum;
+            for (int i = 0; i < 8; i++)
+                coeffs[t*8+i] *= sum;
+        }
+    }
+
+public:
+    float coeffs[32 * 8];
+
+    static RemapTable& instance()
+    {
+        static RemapTable tab;
+        return tab;
+    }
+};
+
+template
+static inline int remap32fCubic(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                                uchar *dst_data, size_t dst_step, int dst_width,
+                                const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
+                                int interpolation, int border_type, const double* border_value)
+{
+    using T = typename helper::ElemType;
+
+    for (int i = start; i < end; i++)
+    {
+        int vl;
+        for (int j = 0; j < dst_width; j += vl)
+        {
+            vl = helper::setvl(dst_width - j);
+            typename RVV_SameLen::VecType mx, my;
+            if (s16)
+            {
+                auto map = __riscv_vlseg2e16_v_i16mf2x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl);
+                mx = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 0), vl);
+                my = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 1), vl);
+            }
+            else
+            {
+                if (mapy == nullptr)
+                {
+                    auto map = __riscv_vlseg2e32_v_f32m1x2(mapx + i * mapx_step + j * 2, vl);
+                    mx = __riscv_vget_v_f32m1x2_f32m1(map, 0);
+                    my = __riscv_vget_v_f32m1x2_f32m1(map, 1);
+                }
+                else
+                {
+                    mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl);
+                    my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl);
+                }
+            }
+            if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
+            {
+                mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl);
+                my = __riscv_vfadd(my, i, vl);
+            }
+
+            auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) {
+                auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
+                auto uy =
RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + + typename RVV_SameLen::VecType ix1, iy1; + if (s16) + { + ix1 = __riscv_vfcvt_x(mx, vl); + iy1 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16mf2(reinterpret_cast(mapy) + i * mapy_step + j, vl); + mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl), vl), 32, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + ix1 = __riscv_vsra(imx, 5, vl); + iy1 = __riscv_vsra(imy, 5, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + } + auto ix0 = __riscv_vsub(ix1, 1, vl), iy0 = __riscv_vsub(iy1, 1, vl); + auto ix2 = __riscv_vadd(ix1, 1, vl), iy2 = __riscv_vadd(iy1, 1, vl); + auto ix3 = __riscv_vadd(ix1, 2, vl), iy3 = __riscv_vadd(iy1, 2, vl); + + // the algorithm is copied from imgproc/src/imgwarp.cpp, + // in the function static void interpolateCubic + typename RVV_SameLen::VecType c0, c1, c2, c3; + auto intertab = [&](typename RVV_SameLen::VecType x) { + constexpr float A = -0.75f; + x = __riscv_vfadd(x, 1, vl); + c0 = __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(x, A, RVV_SameLen::vmv(-5 * A, vl), vl), x, RVV_SameLen::vmv(8 * A, vl), vl), x, RVV_SameLen::vmv(-4 * A, vl), vl); + x = __riscv_vfsub(x, 1, vl); + c1 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen::vmv(1, vl), vl); + x = __riscv_vfrsub(x, 1, vl); + c2 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen::vmv(1, vl), vl); + c3 = __riscv_vfsub(__riscv_vfsub(__riscv_vfrsub(c0, 1, vl), c1, vl), c2, vl); + }; + + intertab(mx); + auto v0 = rvv::vcvt0(access(ix0, iy0), vl); + auto v1 = rvv::vcvt0(access(ix1, iy0), vl); + auto v2 = rvv::vcvt0(access(ix2, iy0), vl); + auto v3 = rvv::vcvt0(access(ix3, iy0), vl); + auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy1), vl); + v1 = rvv::vcvt0(access(ix1, iy1), vl); + v2 = rvv::vcvt0(access(ix2, iy1), vl); + v3 = rvv::vcvt0(access(ix3, iy1), vl); + auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy2), vl); + v1 = rvv::vcvt0(access(ix1, iy2), vl); + v2 = rvv::vcvt0(access(ix2, iy2), vl); + v3 = rvv::vcvt0(access(ix3, iy2), vl); + auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy3), vl); + v1 = rvv::vcvt0(access(ix1, iy3), vl); + v2 = rvv::vcvt0(access(ix2, iy3), vl); + v3 = rvv::vcvt0(access(ix3, iy3), vl); + auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + + intertab(my); + k0 = 
__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl); + + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(k0, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int remap32fLanczos4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + using T = typename helper::ElemType; + + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = helper::setvl(dst_width - j); + typename RVV_SameLen::VecType mx, my; + if (s16) + { + auto map = __riscv_vlseg2e16_v_i16m1x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl); + mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 0), vl); + my = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 1), vl); + } + else + { + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl); + my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl); + } + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) { + auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + + typename RVV_SameLen::VecType ix3, iy3; + typename RVV_SameLen::VecType imx, imy; + if (s16) + { + ix3 = __riscv_vfcvt_x(mx, vl); + iy3 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16m1(reinterpret_cast(mapy) + i * mapy_step + j, vl); + imx = __riscv_vand(md, 31, vl); + imy = __riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl); + } + else + { + auto dmx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto dmy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + ix3 = __riscv_vsra(dmx, 5, vl); + iy3 = __riscv_vsra(dmy, 5, vl); + imx = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmx, 31, vl)), vl); + imy = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmy, 31, vl)), vl); + } + auto ix0 = __riscv_vsub(ix3, 3, vl), iy0 = __riscv_vsub(iy3, 3, vl); + auto ix1 = __riscv_vsub(ix3, 2, vl), iy1 = __riscv_vsub(iy3, 2, vl); + auto ix2 = __riscv_vsub(ix3, 1, vl), iy2 = __riscv_vsub(iy3, 1, vl); + auto ix4 = __riscv_vadd(ix3, 1, vl), iy4 = __riscv_vadd(iy3, 1, vl); + auto ix5 = __riscv_vadd(ix3, 2, vl), iy5 = __riscv_vadd(iy3, 2, vl); + auto ix6 = __riscv_vadd(ix3, 3, vl), iy6 = __riscv_vadd(iy3, 3, vl); + auto ix7 = __riscv_vadd(ix3, 4, vl), iy7 = __riscv_vadd(iy3, 4, vl); + + typename RVV_SameLen::VecType c0, c1, c2, c3, c4, c5, c6, c7; + auto 
intertab = [&](typename RVV_SameLen::VecType x) { + x = __riscv_vmul(x, sizeof(float) * 8, vl); + auto val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, x, vl); + c0 = __riscv_vget_v_f32m2x4_f32m2(val, 0); + c1 = __riscv_vget_v_f32m2x4_f32m2(val, 1); + c2 = __riscv_vget_v_f32m2x4_f32m2(val, 2); + c3 = __riscv_vget_v_f32m2x4_f32m2(val, 3); + val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, __riscv_vadd(x, sizeof(float) * 4, vl), vl); + c4 = __riscv_vget_v_f32m2x4_f32m2(val, 0); + c5 = __riscv_vget_v_f32m2x4_f32m2(val, 1); + c6 = __riscv_vget_v_f32m2x4_f32m2(val, 2); + c7 = __riscv_vget_v_f32m2x4_f32m2(val, 3); + }; + + intertab(imx); + auto v0 = rvv::vcvt0(access(ix0, iy0), vl); + auto v1 = rvv::vcvt0(access(ix1, iy0), vl); + auto v2 = rvv::vcvt0(access(ix2, iy0), vl); + auto v3 = rvv::vcvt0(access(ix3, iy0), vl); + auto v4 = rvv::vcvt0(access(ix4, iy0), vl); + auto v5 = rvv::vcvt0(access(ix5, iy0), vl); + auto v6 = rvv::vcvt0(access(ix6, iy0), vl); + auto v7 = rvv::vcvt0(access(ix7, iy0), vl); + auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy1), vl); + v1 = rvv::vcvt0(access(ix1, iy1), vl); + v2 = rvv::vcvt0(access(ix2, iy1), vl); + v3 = rvv::vcvt0(access(ix3, iy1), vl); + v4 = rvv::vcvt0(access(ix4, iy1), vl); + v5 = rvv::vcvt0(access(ix5, iy1), vl); + v6 = rvv::vcvt0(access(ix6, iy1), vl); + v7 = rvv::vcvt0(access(ix7, iy1), vl); + auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy2), vl); + v1 = rvv::vcvt0(access(ix1, iy2), vl); + v2 = rvv::vcvt0(access(ix2, iy2), vl); + v3 = rvv::vcvt0(access(ix3, iy2), vl); + v4 = rvv::vcvt0(access(ix4, iy2), vl); + v5 = rvv::vcvt0(access(ix5, iy2), vl); + v6 = rvv::vcvt0(access(ix6, iy2), vl); + v7 = rvv::vcvt0(access(ix7, iy2), vl); + auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy3), vl); + v1 = rvv::vcvt0(access(ix1, iy3), vl); + v2 = rvv::vcvt0(access(ix2, iy3), vl); + v3 = rvv::vcvt0(access(ix3, iy3), vl); + v4 = rvv::vcvt0(access(ix4, iy3), vl); + v5 = rvv::vcvt0(access(ix5, iy3), vl); + v6 = rvv::vcvt0(access(ix6, iy3), vl); + v7 = rvv::vcvt0(access(ix7, iy3), vl); + auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy4), vl); + v1 = rvv::vcvt0(access(ix1, iy4), vl); + v2 = rvv::vcvt0(access(ix2, iy4), vl); + v3 = rvv::vcvt0(access(ix3, iy4), vl); + v4 = rvv::vcvt0(access(ix4, iy4), vl); + v5 = rvv::vcvt0(access(ix5, iy4), vl); + v6 = rvv::vcvt0(access(ix6, iy4), vl); + v7 = rvv::vcvt0(access(ix7, iy4), vl); + auto k4 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = 
rvv::vcvt0(access(ix0, iy5), vl); + v1 = rvv::vcvt0(access(ix1, iy5), vl); + v2 = rvv::vcvt0(access(ix2, iy5), vl); + v3 = rvv::vcvt0(access(ix3, iy5), vl); + v4 = rvv::vcvt0(access(ix4, iy5), vl); + v5 = rvv::vcvt0(access(ix5, iy5), vl); + v6 = rvv::vcvt0(access(ix6, iy5), vl); + v7 = rvv::vcvt0(access(ix7, iy5), vl); + auto k5 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy6), vl); + v1 = rvv::vcvt0(access(ix1, iy6), vl); + v2 = rvv::vcvt0(access(ix2, iy6), vl); + v3 = rvv::vcvt0(access(ix3, iy6), vl); + v4 = rvv::vcvt0(access(ix4, iy6), vl); + v5 = rvv::vcvt0(access(ix5, iy6), vl); + v6 = rvv::vcvt0(access(ix6, iy6), vl); + v7 = rvv::vcvt0(access(ix7, iy6), vl); + auto k6 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy7), vl); + v1 = rvv::vcvt0(access(ix1, iy7), vl); + v2 = rvv::vcvt0(access(ix2, iy7), vl); + v3 = rvv::vcvt0(access(ix3, iy7), vl); + v4 = rvv::vcvt0(access(ix4, iy7), vl); + v5 = rvv::vcvt0(access(ix5, iy7), vl); + v6 = rvv::vcvt0(access(ix6, iy7), vl); + v7 = rvv::vcvt0(access(ix7, iy7), vl); + auto k7 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + + intertab(imy); + k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl), k4, c4, vl), k5, c5, vl), k6, c6, vl), k7, c7, vl); + + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(k0, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int remap32fC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + vfloat32m2_t mx, my; + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl); + my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl); + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0); + 
src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, border_value[0], mask, vl); + src1 = __riscv_vmerge(src1, border_value[1], mask, vl); + src2 = __riscv_vmerge(src2, border_value[2], mask, vl); + } + }; + if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2; + access(ix, iy, src0, src1, src2); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + auto ix0 = __riscv_vsra(imx, 5, vl); + auto iy0 = __riscv_vsra(imy, 5, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + + vfloat32m2_t v00, v10, v20; + vfloat32m2_t v01, v11, v21; + vfloat32m2_t v02, v12, v22; + vfloat32m2_t v03, v13, v23; + vuint8mf2_t src0, src1, src2; + access(ix0, iy0, src0, src1, src2); + v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix1, iy0, src0, src1, src2); + v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v11 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix0, iy1, src0, src1, src2); + v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix1, iy1, src0, src1, src2); + v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + + v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl); + v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl); + v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl); + v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl); + v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl); + v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl); + v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl); + v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl); + v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int remap32fC4(int start, int end, const uchar *src_data, size_t src_step, int 
src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + vfloat32m2_t mx, my; + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl); + my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl); + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2); + src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, border_value[0], mask, vl); + src1 = __riscv_vmerge(src1, border_value[1], mask, vl); + src2 = __riscv_vmerge(src2, border_value[2], mask, vl); + src3 = __riscv_vmerge(src3, border_value[3], mask, vl); + } + }; + if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2, src3; + access(ix, iy, src0, src1, src2, src3); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + auto ix0 = __riscv_vsra(imx, 5, vl); + auto iy0 = __riscv_vsra(imy, 5, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + + vfloat32m2_t v00, v10, v20, v30; + vfloat32m2_t v01, v11, v21, v31; + vfloat32m2_t v02, v12, v22, v32; + vfloat32m2_t v03, v13, v23, v33; + vuint8mf2_t src0, src1, src2, src3; + access(ix0, iy0, src0, src1, src2, src3); + v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v30 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix1, iy0, src0, src1, src2, src3); + v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v11 = 
__riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v31 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix0, iy1, src0, src1, src2, src3); + v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v32 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix1, iy1, src0, src1, src2, src3); + v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v33 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + + v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl); + v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl); + v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl); + v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl); + v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl); + v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl); + v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl); + v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl); + v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl); + v30 = __riscv_vfmacc(v30, mx, __riscv_vfsub(v31, v30, vl), vl); + v32 = __riscv_vfmacc(v32, mx, __riscv_vfsub(v33, v32, vl), vl); + v30 = __riscv_vfmacc(v30, my, __riscv_vfsub(v32, v30, vl), vl); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vfncvt_xu(v30, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +// the algorithm is copied from 3rdparty/carotene/src/remap.cpp, +// in the function void CAROTENE_NS::remapNearestNeighbor and void CAROTENE_NS::remapLinear +template +inline int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4 && src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (border_type != CV_HAL_BORDER_CONSTANT && border_type != CV_HAL_BORDER_REPLICATE) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP; + if (mode != CV_HAL_INTER_NEAREST && mode != CV_HAL_INTER_LINEAR && mode != CV_HAL_INTER_CUBIC && mode != CV_HAL_INTER_LANCZOS4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((mode == CV_HAL_INTER_CUBIC || mode == CV_HAL_INTER_LANCZOS4) && CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + mapx_step /= s16 ? sizeof(short) : sizeof(float); + mapy_step /= s16 ? 
sizeof(ushort) : sizeof(float);
+    switch (src_type)
+    {
+    case CV_8UC3:
+        return invoke(dst_width, dst_height, {remap32fC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_8UC4:
+        return invoke(dst_width, dst_height, {remap32fC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    }
+    switch (mode*100 + src_type)
+    {
+    case CV_HAL_INTER_NEAREST*100 + CV_8UC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_16UC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_16UC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_16SC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_16SC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_32FC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_32FC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+
+    case CV_HAL_INTER_CUBIC*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_16UC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_16SC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_32FC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+
+    // Lanczos4 is disabled for Clang, since the register allocation strategy in Clang 20.0 is buggy;
+    // remove this #ifndef in the future once that is fixed
+#ifndef __clang__
+    case CV_HAL_INTER_LANCZOS4*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    // disabled since the universal intrinsics (UI) path is already fast enough
+    // case CV_HAL_INTER_LANCZOS4*100 + CV_16UC1:
+    //     return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_LANCZOS4*100 + CV_16SC1:
+        return invoke(dst_width, dst_height,
{remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); + case CV_HAL_INTER_LANCZOS4*100 + CV_32FC1: + return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); +#endif + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +inline int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); +} + +inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, interpolation, border_type, border_value); +} +} // cv::cv_hal_rvv::remap + +namespace warp { +#undef cv_hal_warpAffine +#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine +#undef cv_hal_warpPerspective +#define cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective + +template +static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8m1(dst_width - j); + auto access = [&](vint32m4_t ix, vint32m4_t iy) { + auto ux = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxei32_v_u8m1(src_data, __riscv_vmadd(uy, src_step, ux, vl), vl); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m4_i32m4(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m4_i32m4(uy), vl), vl); + src = __riscv_vmerge(src, borderValue[0], mask, vl); + } + return src; + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m4(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[1], __riscv_vfmv_v_f_f32m4(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[4], __riscv_vfmv_v_f_f32m4(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[7], __riscv_vfmv_v_f_f32m4(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, 
vl); + __riscv_vse8(dst_data + i * dst_step + j, access(ix, iy), vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + auto v0 = __riscv_vzext_vf4(access(ix0, iy0), vl); + auto v1 = __riscv_vzext_vf4(access(ix1, iy0), vl); + auto v2 = __riscv_vzext_vf4(access(ix0, iy1), vl); + auto v3 = __riscv_vzext_vf4(access(ix1, iy1), vl); + + auto rx = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), rx, __riscv_vsub(v1, v0, vl), vl); + v2 = __riscv_vmacc(__riscv_vmul(v2, 1 << 5, vl), rx, __riscv_vsub(v3, v2, vl), vl); + v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), ry, __riscv_vsub(v2, v0, vl), vl); + __riscv_vse8(dst_data + i * dst_step + j, __riscv_vnclipu(__riscv_vnclipu(v0, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int warpC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, borderValue[0], mask, vl); + src1 = __riscv_vmerge(src1, borderValue[1], mask, vl); + src2 = __riscv_vmerge(src2, borderValue[2], mask, vl); + } + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2; + access(ix, iy, src0, src1, src2); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0); + dst = 
__riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + vuint32m2_t v00, v10, v20; + vuint32m2_t v01, v11, v21; + vuint32m2_t v02, v12, v22; + vuint32m2_t v03, v13, v23; + vuint8mf2_t src0, src1, src2; + access(ix0, iy0, src0, src1, src2); + v00 = __riscv_vzext_vf4(src0, vl); + v10 = __riscv_vzext_vf4(src1, vl); + v20 = __riscv_vzext_vf4(src2, vl); + access(ix1, iy0, src0, src1, src2); + v01 = __riscv_vzext_vf4(src0, vl); + v11 = __riscv_vzext_vf4(src1, vl); + v21 = __riscv_vzext_vf4(src2, vl); + access(ix0, iy1, src0, src1, src2); + v02 = __riscv_vzext_vf4(src0, vl); + v12 = __riscv_vzext_vf4(src1, vl); + v22 = __riscv_vzext_vf4(src2, vl); + access(ix1, iy1, src0, src1, src2); + v03 = __riscv_vzext_vf4(src0, vl); + v13 = __riscv_vzext_vf4(src1, vl); + v23 = __riscv_vzext_vf4(src2, vl); + + auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl); + v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl); + v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl); + v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl); + v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl); + v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl); + v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl); + v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int warpC4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 
1, vl)); + auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2); + src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, borderValue[0], mask, vl); + src1 = __riscv_vmerge(src1, borderValue[1], mask, vl); + src2 = __riscv_vmerge(src2, borderValue[2], mask, vl); + src3 = __riscv_vmerge(src3, borderValue[3], mask, vl); + } + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2, src3; + access(ix, iy, src0, src1, src2, src3); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + vuint32m2_t v00, v10, v20, v30; + vuint32m2_t v01, v11, v21, v31; + vuint32m2_t v02, v12, v22, v32; + vuint32m2_t v03, v13, v23, v33; + vuint8mf2_t src0, src1, src2, src3; + access(ix0, iy0, src0, src1, src2, src3); + v00 = __riscv_vzext_vf4(src0, vl); + v10 = __riscv_vzext_vf4(src1, vl); + v20 = __riscv_vzext_vf4(src2, vl); + v30 = __riscv_vzext_vf4(src3, vl); + access(ix1, iy0, src0, src1, src2, src3); + v01 = __riscv_vzext_vf4(src0, vl); + v11 = __riscv_vzext_vf4(src1, vl); + v21 = __riscv_vzext_vf4(src2, vl); + v31 = __riscv_vzext_vf4(src3, vl); + access(ix0, iy1, src0, src1, src2, src3); + v02 = __riscv_vzext_vf4(src0, vl); + v12 = __riscv_vzext_vf4(src1, vl); + v22 = __riscv_vzext_vf4(src2, vl); + v32 = __riscv_vzext_vf4(src3, vl); + access(ix1, iy1, src0, src1, src2, src3); + v03 = __riscv_vzext_vf4(src0, vl); + v13 = __riscv_vzext_vf4(src1, vl); + v23 = __riscv_vzext_vf4(src2, vl); + v33 = __riscv_vzext_vf4(src3, vl); + + auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl); + v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl); + v00 = 
+                v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl);
+                v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl);
+                v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl);
+                v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl);
+                v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl);
+                v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl);
+                v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl);
+                v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), rx, __riscv_vsub(v31, v30, vl), vl);
+                v32 = __riscv_vmacc(__riscv_vmul(v32, 1 << 5, vl), rx, __riscv_vsub(v33, v32, vl), vl);
+                v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), ry, __riscv_vsub(v32, v30, vl), vl);
+                vuint8mf2x4_t dst{};
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vnclipu(v30, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp,
+// in the functions void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear
+inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
+{
+    if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    switch (src_type)
+    {
+    case CV_8UC1:
+        return remap::invoke(dst_width, dst_height, {warpC1<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC3:
+        return remap::invoke(dst_width, dst_height, {warpC3<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC4:
+        return remap::invoke(dst_width, dst_height, {warpC4<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
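warpAffine above and warpPerspective below differ only in the template argument they pass to `warpC1/C3/C4`; the per-pixel coordinate transform that argument selects looks like this in scalar form (illustrative helper, not part of the patch):

```cpp
// Scalar equivalent of the mx/my computation in warpC1/C3/C4.
// M holds 6 coefficients in the affine case and 9 in the perspective case;
// the vector path computes the reciprocal with vfrdiv instead of a divide.
static inline void map_pixel(const double* M, bool perspective,
                             int x, int y, float& mx, float& my)
{
    mx = (float)(M[0] * x + M[1] * y + M[2]);
    my = (float)(M[3] * x + M[4] * y + M[5]);
    if (perspective)
    {
        float md = 1.0f / (float)(M[6] * x + M[7] * y + M[8]);
        mx *= md;
        my *= md;
    }
}
```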
+
+// the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp,
+// in the functions void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear
+inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
+{
+    if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    switch (src_type)
+    {
+    case CV_8UC1:
+        return remap::invoke(dst_width, dst_height, {warpC1<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC3:
+        return remap::invoke(dst_width, dst_height, {warpC3<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC4:
+        return remap::invoke(dst_width, dst_height, {warpC4<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+} // cv::cv_hal_rvv::warp
+
+}}
+
+#endif
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index 26aa58e77e..645e9557ed 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -373,9 +373,58 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step,
                            float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
                            int interpolation, int border_type, const double border_value[4])
 { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_remap with floating point maps
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param map map for xy values
+   @param map_step map matrix step
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param border_type border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param border_value values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::remap
+ */
+inline int hal_ni_remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                             uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                             float* map, size_t map_step, int interpolation, int border_type, const double border_value[4])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
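Unlike `cv_hal_remap32f`, which receives two single-channel float maps, this variant receives one interleaved two-channel (CV_32FC2) map. A hypothetical helper showing the per-pixel lookup (assuming `map_step` is in bytes, as for the other HAL matrix steps):

```cpp
#include <cstddef>

// How dst(x, y) finds its source coordinate in the interleaved CV_32FC2 map
// passed to cv_hal_remap32fc2 (illustrative only).
static inline void read_map32fc2(const float* map, size_t map_step,
                                 int x, int y, float& sx, float& sy)
{
    const float* row = (const float*)((const char*)map + y * map_step);
    sx = row[2 * x];      // source x for destination pixel (x, y)
    sy = row[2 * x + 1];  // source y
}
```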
+/**
+   @brief hal_remap with fixed-point maps
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param mapx map for x values
+   @param mapx_step mapx matrix step
+   @param mapy map for y values
+   @param mapy_step mapy matrix step
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param border_type border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param border_value values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::remap
+ */
+inline int hal_ni_remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                           uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                           short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step,
+                           int interpolation, int border_type, const double border_value[4])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 //! @cond IGNORED
 #define cv_hal_remap32f hal_ni_remap32f
+#define cv_hal_remap32fc2 hal_ni_remap32fc2
+#define cv_hal_remap16s hal_ni_remap16s
 //! @endcond
 
 /**
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 38f61333cf..f348860549 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1720,6 +1720,16 @@ void cv::remap( InputArray _src, OutputArray _dst,
         CALL_HAL(remap32f, cv_hal_remap32f, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                  map1.ptr<float>(), map1.step, map2.ptr<float>(), map2.step, interpolation, borderType, borderValue.val);
     }
+    if ((map1.type() == CV_32FC2) && map2.empty())
+    {
+        CALL_HAL(remap32fc2, cv_hal_remap32fc2, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                 map1.ptr<float>(), map1.step, interpolation, borderType, borderValue.val);
+    }
+    if ((map1.type() == CV_16SC2) && (map2.empty() || map2.type() == CV_16UC1))
+    {
+        CALL_HAL(remap16s, cv_hal_remap16s, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                 map1.ptr<short>(), map1.step, map2.ptr<ushort>(), map2.step, interpolation, borderType, borderValue.val);
+    }
 
     interpolation &= ~WARP_RELATIVE_MAP;
     if( interpolation == INTER_AREA )
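For reference, a user-level sketch of how both new hooks are reached on a build where this HAL is active: cv::remap forwards a CV_32FC2 map with an empty second map to `cv_hal_remap32fc2`, and the CV_16SC2/CV_16UC1 pair produced by cv::convertMaps to `cv_hal_remap16s`. The map values below (a horizontal flip) are arbitrary:

```cpp
#include <opencv2/imgproc.hpp>

// Exercises both new remap hooks via the public API.
void remap_demo(const cv::Mat& src, cv::Mat& dst)
{
    cv::Mat map1(src.size(), CV_32FC2);
    for (int y = 0; y < map1.rows; y++)
        for (int x = 0; x < map1.cols; x++)
            map1.at<cv::Vec2f>(y, x) = cv::Vec2f((float)(map1.cols - 1 - x), (float)y);

    // CV_32FC2 map, no second map -> cv_hal_remap32fc2
    cv::remap(src, dst, map1, cv::noArray(), cv::INTER_LINEAR);

    // Fixed-point maps from convertMaps (CV_16SC2 + CV_16UC1) -> cv_hal_remap16s
    cv::Mat fixed1, fixed2;
    cv::convertMaps(map1, cv::noArray(), fixed1, fixed2, CV_16SC2);
    cv::remap(src, dst, fixed1, fixed2, cv::INTER_LINEAR);
}
```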