From fa58c1205b5a3149e9b75f74c796593debad319d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=A4=A9=E9=9F=B3=E3=81=82=E3=82=81?=
Date: Tue, 25 Mar 2025 16:57:47 +0800
Subject: [PATCH] Merge pull request #27119 from amane-ame:warp_hal_rvv

Add RISC-V HAL implementation for cv::warp series #27119

This patch implements `cv_hal_remap`, `cv_hal_warpAffine`, and `cv_hal_warpPerspective` using native RVV intrinsics, optimizing the performance of `cv::remap/cv::warpAffine/cv::warpPerspective` for the `CV_HAL_INTER_NEAREST/CV_HAL_INTER_LINEAR/CV_HAL_INTER_CUBIC/CV_HAL_INTER_LANCZOS4` modes.

Tested on MUSE-PI (Spacemit X60) with both GCC 14.2 and Clang 20.0. A minimal usage sketch that drives these code paths through the public API is appended at the end of this patch.

```
$ ./opencv_test_imgproc --gtest_filter="*Remap*:*Warp*"
$ ./opencv_perf_imgproc --gtest_filter="*Remap*:*remap*:*Warp*" --perf_min_samples=200 --perf_force_samples=200
```

View the full perf table here: [hal_rvv_warp.pdf](https://github.com/user-attachments/files/19403718/hal_rvv_warp.pdf)

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 3rdparty/hal_rvv/hal_rvv.hpp            |    1 +
 3rdparty/hal_rvv/hal_rvv_1p0/types.hpp  |    6 +-
 3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp   | 1208 +++++++++++++++++++++++
 modules/imgproc/src/hal_replacement.hpp |   49 +
 modules/imgproc/src/imgwarp.cpp         |   10 +
 5 files changed, 1272 insertions(+), 2 deletions(-)
 create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp

diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp
index 8287ebeda9..4a10906a33 100644
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@@ -50,6 +50,7 @@
 #include "hal_rvv_1p0/filter.hpp" // imgproc
 #include "hal_rvv_1p0/pyramids.hpp" // imgproc
 #include "hal_rvv_1p0/color.hpp" // imgproc
+#include "hal_rvv_1p0/warp.hpp" // imgproc
 #include "hal_rvv_1p0/thresh.hpp" // imgproc
 #include "hal_rvv_1p0/histogram.hpp" // imgproc
 #endif
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
index 79db847eb5..8c8ad23787 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp
@@ -91,10 +91,12 @@ using RVV_F64M2 = struct RVV;
 using RVV_F64M4 = struct RVV;
 using RVV_F64M8 = struct RVV;
 
-// Only for dst type lmul >= 1
 template
 using RVV_SameLen =
-    RVV;
+    RVV((RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.5 ? 9 : \
+        (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.25 ? 10 : \
+        (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ?
static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType) == 0.125 ? 11 : \ + (RVV_T::lmul <= 8 ? RVV_T::lmul * static_cast(sizeof(Dst_T)) : RVV_T::lmul == 9 ? static_cast(sizeof(Dst_T)) / 2 : RVV_T::lmul == 10 ? static_cast(sizeof(Dst_T)) / 4 : static_cast(sizeof(Dst_T)) / 8) / sizeof(typename RVV_T::ElemType)))>; template struct RVV_ToIntHelper; template struct RVV_ToUintHelper; diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp new file mode 100644 index 0000000000..d9fcf9c109 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/warp.hpp @@ -0,0 +1,1208 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#ifndef OPENCV_HAL_RVV_WARP_HPP_INCLUDED +#define OPENCV_HAL_RVV_WARP_HPP_INCLUDED + +#include + +namespace cv { namespace cv_hal_rvv { + +namespace remap { +#undef cv_hal_remap32f +#define cv_hal_remap32f cv::cv_hal_rvv::remap::remap32f +#undef cv_hal_remap32fc2 +#define cv_hal_remap32fc2 cv::cv_hal_rvv::remap::remap32fc2 +#undef cv_hal_remap16s +#define cv_hal_remap16s cv::cv_hal_rvv::remap::remap16s + +class RemapInvoker : public ParallelLoopBody +{ +public: + template + RemapInvoker(std::function _func, Args&&... args) + { + func = std::bind(_func, std::placeholders::_1, std::placeholders::_2, std::forward(args)...); + } + + virtual void operator()(const Range& range) const override + { + func(range.start, range.end); + } + +private: + std::function func; +}; + +template +static inline int invoke(int width, int height, std::function func, Args&&... 
args) +{ + cv::parallel_for_(Range(1, height), RemapInvoker(func, std::forward(args)...), static_cast((width - 1) * height) / (1 << 15)); + return func(0, 1, std::forward(args)...); +} + +template struct rvv; +// NN & LINEAR +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vuint8m2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8m2_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8m2_t vloxei(const uchar* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u8m2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vuint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vuint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16m4_t vloxei(const ushort* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_u16m4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vint16m4_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16m4_t vcvt1(vfloat32m8_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16m4_t vloxei(const short* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_i16m4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m8_t vcvt0(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vcvt1(vfloat32m8_t a, size_t) { return a; } + static inline vfloat32m8_t vloxei(const float* a, vuint32m8_t b, size_t c) { return __riscv_vloxei32_v_f32m8(a, b, c); } +}; +// CUBIC +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vuint8mf4_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8mf4_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8mf4_t vloxei(const uchar* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u8mf4(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vuint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vuint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16mf2_t vloxei(const ushort* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_u16mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vint16mf2_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16mf2_t vcvt1(vfloat32m1_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16mf2_t vloxei(const short* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_i16mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m1_t vcvt0(vfloat32m1_t a, size_t) { return a; } + static inline vfloat32m1_t vcvt1(vfloat32m1_t a, size_t) { return a; } + static inline vfloat32m1_t vloxei(const float* a, vuint32m1_t b, size_t c) { return __riscv_vloxei32_v_f32m1(a, b, c); } +}; +// LANCZOS4 +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vuint8mf2_t a, size_t b) { return __riscv_vfcvt_f(__riscv_vzext_vf4(a, b), b); } + static inline vuint8mf2_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vnclipu(__riscv_vfncvt_xu(a, b), 0, __RISCV_VXRM_RNU, b); } + static inline vuint8mf2_t vloxei(const uchar* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u8mf2(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vuint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + 
static inline vuint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_xu(a, b); } + static inline vuint16m1_t vloxei(const ushort* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_u16m1(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vint16m1_t a, size_t b) { return __riscv_vfwcvt_f(a, b); } + static inline vint16m1_t vcvt1(vfloat32m2_t a, size_t b) { return __riscv_vfncvt_x(a, b); } + static inline vint16m1_t vloxei(const short* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_i16m1(a, b, c); } +}; +template<> struct rvv +{ + static inline vfloat32m2_t vcvt0(vfloat32m2_t a, size_t) { return a; } + static inline vfloat32m2_t vcvt1(vfloat32m2_t a, size_t) { return a; } + static inline vfloat32m2_t vloxei(const float* a, vuint32m2_t b, size_t c) { return __riscv_vloxei32_v_f32m2(a, b, c); } +}; + +template +static inline int remap32fC1(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + using T = typename helper::ElemType; + const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP; + + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = helper::setvl(dst_width - j); + typename RVV_SameLen::VecType mx, my; + if (s16) + { + auto map = __riscv_vlseg2e16_v_i16m4x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl); + mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 0), vl); + my = __riscv_vfwcvt_f(__riscv_vget_v_i16m4x2_i16m4(map, 1), vl); + } + else + { + if (mapy == nullptr) + { + mx = RVV_SameLen::vload_stride(mapx + i * mapx_step + j * 2 , sizeof(float) * 2, vl); + my = RVV_SameLen::vload_stride(mapx + i * mapx_step + j * 2 + 1, sizeof(float) * 2, vl); + } + else + { + mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl); + my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl); + } + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) { + auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + if (mode == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, access(ix, iy), vl); + } + else if (mode == CV_HAL_INTER_LINEAR) + { + typename RVV_SameLen::VecType ix0, iy0; + if (s16) + { + ix0 = __riscv_vfcvt_x(mx, vl); + iy0 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16m4(reinterpret_cast(mapy) + i * mapy_step + j, vl); + mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, 
vl), vl), 32, vl);
+                }
+                else
+                {
+                    auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl);
+                    auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl);
+                    ix0 = __riscv_vsra(imx, 5, vl);
+                    iy0 = __riscv_vsra(imy, 5, vl);
+                    mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl);
+                    my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl);
+                }
+                auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl);
+                auto v0 = rvv::vcvt0(access(ix0, iy0), vl);
+                auto v1 = rvv::vcvt0(access(ix1, iy0), vl);
+                auto v2 = rvv::vcvt0(access(ix0, iy1), vl);
+                auto v3 = rvv::vcvt0(access(ix1, iy1), vl);
+
+                v0 = __riscv_vfmacc(v0, mx, __riscv_vfsub(v1, v0, vl), vl);
+                v2 = __riscv_vfmacc(v2, mx, __riscv_vfsub(v3, v2, vl), vl);
+                v0 = __riscv_vfmacc(v0, my, __riscv_vfsub(v2, v0, vl), vl);
+                helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(v0, vl), vl);
+            }
+            else
+            {
+                return CV_HAL_ERROR_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+class RemapTable
+{
+private:
+    RemapTable()
+    {
+        // the algorithm is copied from imgproc/src/imgwarp.cpp,
+        // in the function static void interpolateLanczos4
+        constexpr double s45 = 0.70710678118654752440084436210485;
+        constexpr double cs[][2] = {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};
+
+        for (int t = 0; t < 32; t++)
+        {
+            float x = t / 32.0f;
+            if (x < FLT_EPSILON)
+            {
+                for (int i = 0; i < 8; i++)
+                    coeffs[t*8+i] = 0;
+                coeffs[t*8+3] = 1;
+                continue;
+            }
+
+            float sum = 0;
+            double y0 = -(x+3)*CV_PI*0.25, s0 = std::sin(y0), c0 = std::cos(y0);
+            for (int i = 0; i < 8; i++)
+            {
+                double y = -(x+3-i)*CV_PI*0.25;
+                coeffs[t*8+i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
+                sum += coeffs[t*8+i];
+            }
+
+            sum = 1.f/sum;
+            for (int i = 0; i < 8; i++)
+                coeffs[t*8+i] *= sum;
+        }
+    }
+
+public:
+    float coeffs[32 * 8];
+
+    static RemapTable& instance()
+    {
+        static RemapTable tab;
+        return tab;
+    }
+};
+
+template
+static inline int remap32fCubic(int start, int end, bool s16, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                                uchar *dst_data, size_t dst_step, int dst_width,
+                                const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step,
+                                int interpolation, int border_type, const double* border_value)
+{
+    using T = typename helper::ElemType;
+
+    for (int i = start; i < end; i++)
+    {
+        int vl;
+        for (int j = 0; j < dst_width; j += vl)
+        {
+            vl = helper::setvl(dst_width - j);
+            typename RVV_SameLen::VecType mx, my;
+            if (s16)
+            {
+                auto map = __riscv_vlseg2e16_v_i16mf2x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl);
+                mx = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 0), vl);
+                my = __riscv_vfwcvt_f(__riscv_vget_v_i16mf2x2_i16mf2(map, 1), vl);
+            }
+            else
+            {
+                if (mapy == nullptr)
+                {
+                    auto map = __riscv_vlseg2e32_v_f32m1x2(mapx + i * mapx_step + j * 2, vl);
+                    mx = __riscv_vget_v_f32m1x2_f32m1(map, 0);
+                    my = __riscv_vget_v_f32m1x2_f32m1(map, 1);
+                }
+                else
+                {
+                    mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl);
+                    my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl);
+                }
+            }
+            if (interpolation & CV_HAL_WARP_RELATIVE_MAP)
+            {
+                mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl);
+                my = __riscv_vfadd(my, i, vl);
+            }
+
+            auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) {
+                auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl));
+                auto uy =
RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + + typename RVV_SameLen::VecType ix1, iy1; + if (s16) + { + ix1 = __riscv_vfcvt_x(mx, vl); + iy1 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16mf2(reinterpret_cast(mapy) + i * mapy_step + j, vl); + mx = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(md, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfwcvt_f(__riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl), vl), 32, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + ix1 = __riscv_vsra(imx, 5, vl); + iy1 = __riscv_vsra(imy, 5, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + } + auto ix0 = __riscv_vsub(ix1, 1, vl), iy0 = __riscv_vsub(iy1, 1, vl); + auto ix2 = __riscv_vadd(ix1, 1, vl), iy2 = __riscv_vadd(iy1, 1, vl); + auto ix3 = __riscv_vadd(ix1, 2, vl), iy3 = __riscv_vadd(iy1, 2, vl); + + // the algorithm is copied from imgproc/src/imgwarp.cpp, + // in the function static void interpolateCubic + typename RVV_SameLen::VecType c0, c1, c2, c3; + auto intertab = [&](typename RVV_SameLen::VecType x) { + constexpr float A = -0.75f; + x = __riscv_vfadd(x, 1, vl); + c0 = __riscv_vfmadd(__riscv_vfmadd(__riscv_vfmadd(x, A, RVV_SameLen::vmv(-5 * A, vl), vl), x, RVV_SameLen::vmv(8 * A, vl), vl), x, RVV_SameLen::vmv(-4 * A, vl), vl); + x = __riscv_vfsub(x, 1, vl); + c1 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen::vmv(1, vl), vl); + x = __riscv_vfrsub(x, 1, vl); + c2 = __riscv_vfmadd(__riscv_vfmul(__riscv_vfmadd(x, A + 2, RVV_SameLen::vmv(-(A + 3), vl), vl), x, vl), x, RVV_SameLen::vmv(1, vl), vl); + c3 = __riscv_vfsub(__riscv_vfsub(__riscv_vfrsub(c0, 1, vl), c1, vl), c2, vl); + }; + + intertab(mx); + auto v0 = rvv::vcvt0(access(ix0, iy0), vl); + auto v1 = rvv::vcvt0(access(ix1, iy0), vl); + auto v2 = rvv::vcvt0(access(ix2, iy0), vl); + auto v3 = rvv::vcvt0(access(ix3, iy0), vl); + auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy1), vl); + v1 = rvv::vcvt0(access(ix1, iy1), vl); + v2 = rvv::vcvt0(access(ix2, iy1), vl); + v3 = rvv::vcvt0(access(ix3, iy1), vl); + auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy2), vl); + v1 = rvv::vcvt0(access(ix1, iy2), vl); + v2 = rvv::vcvt0(access(ix2, iy2), vl); + v3 = rvv::vcvt0(access(ix3, iy2), vl); + auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + v0 = rvv::vcvt0(access(ix0, iy3), vl); + v1 = rvv::vcvt0(access(ix1, iy3), vl); + v2 = rvv::vcvt0(access(ix2, iy3), vl); + v3 = rvv::vcvt0(access(ix3, iy3), vl); + auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl); + + intertab(my); + k0 = 
__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl); + + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(k0, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int remap32fLanczos4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + using T = typename helper::ElemType; + + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = helper::setvl(dst_width - j); + typename RVV_SameLen::VecType mx, my; + if (s16) + { + auto map = __riscv_vlseg2e16_v_i16m1x2(reinterpret_cast(mapx) + i * mapx_step + j * 2, vl); + mx = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 0), vl); + my = __riscv_vfwcvt_f(__riscv_vget_v_i16m1x2_i16m1(map, 1), vl); + } + else + { + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = RVV_SameLen::vload(mapx + i * mapx_step + j, vl); + my = RVV_SameLen::vload(mapy + i * mapy_step + j, vl); + } + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(RVV_SameLen::vid(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](typename RVV_SameLen::VecType ix, typename RVV_SameLen::VecType iy) { + auto ux = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = RVV_SameLen::reinterpret(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = rvv::vloxei(reinterpret_cast(src_data), __riscv_vmadd(uy, src_step, __riscv_vmul(ux, sizeof(T), vl), vl), vl); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, RVV_SameLen::reinterpret(ux), vl), __riscv_vmsne(iy, RVV_SameLen::reinterpret(uy), vl), vl); + src = __riscv_vmerge(src, helper::vmv(border_value[0], vl), mask, vl); + } + return src; + }; + + typename RVV_SameLen::VecType ix3, iy3; + typename RVV_SameLen::VecType imx, imy; + if (s16) + { + ix3 = __riscv_vfcvt_x(mx, vl); + iy3 = __riscv_vfcvt_x(my, vl); + auto md = __riscv_vle16_v_u16m1(reinterpret_cast(mapy) + i * mapy_step + j, vl); + imx = __riscv_vand(md, 31, vl); + imy = __riscv_vand(__riscv_vsrl(md, 5, vl), 31, vl); + } + else + { + auto dmx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto dmy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + ix3 = __riscv_vsra(dmx, 5, vl); + iy3 = __riscv_vsra(dmy, 5, vl); + imx = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmx, 31, vl)), vl); + imy = __riscv_vncvt_x(__riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(dmy, 31, vl)), vl); + } + auto ix0 = __riscv_vsub(ix3, 3, vl), iy0 = __riscv_vsub(iy3, 3, vl); + auto ix1 = __riscv_vsub(ix3, 2, vl), iy1 = __riscv_vsub(iy3, 2, vl); + auto ix2 = __riscv_vsub(ix3, 1, vl), iy2 = __riscv_vsub(iy3, 1, vl); + auto ix4 = __riscv_vadd(ix3, 1, vl), iy4 = __riscv_vadd(iy3, 1, vl); + auto ix5 = __riscv_vadd(ix3, 2, vl), iy5 = __riscv_vadd(iy3, 2, vl); + auto ix6 = __riscv_vadd(ix3, 3, vl), iy6 = __riscv_vadd(iy3, 3, vl); + auto ix7 = __riscv_vadd(ix3, 4, vl), iy7 = __riscv_vadd(iy3, 4, vl); + + typename RVV_SameLen::VecType c0, c1, c2, c3, c4, c5, c6, c7; + auto 
intertab = [&](typename RVV_SameLen::VecType x) { + x = __riscv_vmul(x, sizeof(float) * 8, vl); + auto val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, x, vl); + c0 = __riscv_vget_v_f32m2x4_f32m2(val, 0); + c1 = __riscv_vget_v_f32m2x4_f32m2(val, 1); + c2 = __riscv_vget_v_f32m2x4_f32m2(val, 2); + c3 = __riscv_vget_v_f32m2x4_f32m2(val, 3); + val = __riscv_vloxseg4ei16_v_f32m2x4(RemapTable::instance().coeffs, __riscv_vadd(x, sizeof(float) * 4, vl), vl); + c4 = __riscv_vget_v_f32m2x4_f32m2(val, 0); + c5 = __riscv_vget_v_f32m2x4_f32m2(val, 1); + c6 = __riscv_vget_v_f32m2x4_f32m2(val, 2); + c7 = __riscv_vget_v_f32m2x4_f32m2(val, 3); + }; + + intertab(imx); + auto v0 = rvv::vcvt0(access(ix0, iy0), vl); + auto v1 = rvv::vcvt0(access(ix1, iy0), vl); + auto v2 = rvv::vcvt0(access(ix2, iy0), vl); + auto v3 = rvv::vcvt0(access(ix3, iy0), vl); + auto v4 = rvv::vcvt0(access(ix4, iy0), vl); + auto v5 = rvv::vcvt0(access(ix5, iy0), vl); + auto v6 = rvv::vcvt0(access(ix6, iy0), vl); + auto v7 = rvv::vcvt0(access(ix7, iy0), vl); + auto k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy1), vl); + v1 = rvv::vcvt0(access(ix1, iy1), vl); + v2 = rvv::vcvt0(access(ix2, iy1), vl); + v3 = rvv::vcvt0(access(ix3, iy1), vl); + v4 = rvv::vcvt0(access(ix4, iy1), vl); + v5 = rvv::vcvt0(access(ix5, iy1), vl); + v6 = rvv::vcvt0(access(ix6, iy1), vl); + v7 = rvv::vcvt0(access(ix7, iy1), vl); + auto k1 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy2), vl); + v1 = rvv::vcvt0(access(ix1, iy2), vl); + v2 = rvv::vcvt0(access(ix2, iy2), vl); + v3 = rvv::vcvt0(access(ix3, iy2), vl); + v4 = rvv::vcvt0(access(ix4, iy2), vl); + v5 = rvv::vcvt0(access(ix5, iy2), vl); + v6 = rvv::vcvt0(access(ix6, iy2), vl); + v7 = rvv::vcvt0(access(ix7, iy2), vl); + auto k2 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy3), vl); + v1 = rvv::vcvt0(access(ix1, iy3), vl); + v2 = rvv::vcvt0(access(ix2, iy3), vl); + v3 = rvv::vcvt0(access(ix3, iy3), vl); + v4 = rvv::vcvt0(access(ix4, iy3), vl); + v5 = rvv::vcvt0(access(ix5, iy3), vl); + v6 = rvv::vcvt0(access(ix6, iy3), vl); + v7 = rvv::vcvt0(access(ix7, iy3), vl); + auto k3 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy4), vl); + v1 = rvv::vcvt0(access(ix1, iy4), vl); + v2 = rvv::vcvt0(access(ix2, iy4), vl); + v3 = rvv::vcvt0(access(ix3, iy4), vl); + v4 = rvv::vcvt0(access(ix4, iy4), vl); + v5 = rvv::vcvt0(access(ix5, iy4), vl); + v6 = rvv::vcvt0(access(ix6, iy4), vl); + v7 = rvv::vcvt0(access(ix7, iy4), vl); + auto k4 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = 
rvv::vcvt0(access(ix0, iy5), vl); + v1 = rvv::vcvt0(access(ix1, iy5), vl); + v2 = rvv::vcvt0(access(ix2, iy5), vl); + v3 = rvv::vcvt0(access(ix3, iy5), vl); + v4 = rvv::vcvt0(access(ix4, iy5), vl); + v5 = rvv::vcvt0(access(ix5, iy5), vl); + v6 = rvv::vcvt0(access(ix6, iy5), vl); + v7 = rvv::vcvt0(access(ix7, iy5), vl); + auto k5 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy6), vl); + v1 = rvv::vcvt0(access(ix1, iy6), vl); + v2 = rvv::vcvt0(access(ix2, iy6), vl); + v3 = rvv::vcvt0(access(ix3, iy6), vl); + v4 = rvv::vcvt0(access(ix4, iy6), vl); + v5 = rvv::vcvt0(access(ix5, iy6), vl); + v6 = rvv::vcvt0(access(ix6, iy6), vl); + v7 = rvv::vcvt0(access(ix7, iy6), vl); + auto k6 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + v0 = rvv::vcvt0(access(ix0, iy7), vl); + v1 = rvv::vcvt0(access(ix1, iy7), vl); + v2 = rvv::vcvt0(access(ix2, iy7), vl); + v3 = rvv::vcvt0(access(ix3, iy7), vl); + v4 = rvv::vcvt0(access(ix4, iy7), vl); + v5 = rvv::vcvt0(access(ix5, iy7), vl); + v6 = rvv::vcvt0(access(ix6, iy7), vl); + v7 = rvv::vcvt0(access(ix7, iy7), vl); + auto k7 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(v0, c0, vl), v1, c1, vl), v2, c2, vl), v3, c3, vl), v4, c4, vl), v5, c5, vl), v6, c6, vl), v7, c7, vl); + + intertab(imy); + k0 = __riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmacc(__riscv_vfmul(k0, c0, vl), k1, c1, vl), k2, c2, vl), k3, c3, vl), k4, c4, vl), k5, c5, vl), k6, c6, vl), k7, c7, vl); + + helper::vstore(reinterpret_cast(dst_data + i * dst_step) + j, rvv::vcvt1(k0, vl), vl); + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int remap32fC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + vfloat32m2_t mx, my; + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl); + my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl); + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0); + 
src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, border_value[0], mask, vl); + src1 = __riscv_vmerge(src1, border_value[1], mask, vl); + src2 = __riscv_vmerge(src2, border_value[2], mask, vl); + } + }; + if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2; + access(ix, iy, src0, src1, src2); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + auto ix0 = __riscv_vsra(imx, 5, vl); + auto iy0 = __riscv_vsra(imy, 5, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + + vfloat32m2_t v00, v10, v20; + vfloat32m2_t v01, v11, v21; + vfloat32m2_t v02, v12, v22; + vfloat32m2_t v03, v13, v23; + vuint8mf2_t src0, src1, src2; + access(ix0, iy0, src0, src1, src2); + v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix1, iy0, src0, src1, src2); + v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v11 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix0, iy1, src0, src1, src2); + v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + access(ix1, iy1, src0, src1, src2); + v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + + v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl); + v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl); + v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl); + v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl); + v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl); + v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl); + v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl); + v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl); + v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +static inline int remap32fC4(int start, int end, const uchar *src_data, size_t src_step, int 
src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, + const float* mapx, size_t mapx_step, const float* mapy, size_t mapy_step, + int interpolation, int border_type, const double* border_value) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + vfloat32m2_t mx, my; + if (mapy == nullptr) + { + auto map = __riscv_vlseg2e32_v_f32m2x2(mapx + i * mapx_step + j * 2, vl); + mx = __riscv_vget_v_f32m2x2_f32m2(map, 0); + my = __riscv_vget_v_f32m2x2_f32m2(map, 1); + } + else + { + mx = __riscv_vle32_v_f32m2(mapx + i * mapx_step + j, vl); + my = __riscv_vle32_v_f32m2(mapy + i * mapy_step + j, vl); + } + if (interpolation & CV_HAL_WARP_RELATIVE_MAP) + { + mx = __riscv_vfadd(mx, __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl), vl); + my = __riscv_vfadd(my, i, vl); + } + + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2); + src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3); + if (border_type == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, border_value[0], mask, vl); + src1 = __riscv_vmerge(src1, border_value[1], mask, vl); + src2 = __riscv_vmerge(src2, border_value[2], mask, vl); + src3 = __riscv_vmerge(src3, border_value[3], mask, vl); + } + }; + if ((interpolation & ~CV_HAL_WARP_RELATIVE_MAP) == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2, src3; + access(ix, iy, src0, src1, src2, src3); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + auto imx = __riscv_vfcvt_x(__riscv_vfmul(mx, 32, vl), vl); + auto imy = __riscv_vfcvt_x(__riscv_vfmul(my, 32, vl), vl); + auto ix0 = __riscv_vsra(imx, 5, vl); + auto iy0 = __riscv_vsra(imy, 5, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + mx = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imx, 31, vl), vl), 32, vl); + my = __riscv_vfdiv(__riscv_vfcvt_f(__riscv_vand(imy, 31, vl), vl), 32, vl); + + vfloat32m2_t v00, v10, v20, v30; + vfloat32m2_t v01, v11, v21, v31; + vfloat32m2_t v02, v12, v22, v32; + vfloat32m2_t v03, v13, v23, v33; + vuint8mf2_t src0, src1, src2, src3; + access(ix0, iy0, src0, src1, src2, src3); + v00 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v10 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v20 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v30 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix1, iy0, src0, src1, src2, src3); + v01 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v11 = 
__riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v21 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v31 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix0, iy1, src0, src1, src2, src3); + v02 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v12 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v22 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v32 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + access(ix1, iy1, src0, src1, src2, src3); + v03 = __riscv_vfcvt_f(__riscv_vzext_vf4(src0, vl), vl); + v13 = __riscv_vfcvt_f(__riscv_vzext_vf4(src1, vl), vl); + v23 = __riscv_vfcvt_f(__riscv_vzext_vf4(src2, vl), vl); + v33 = __riscv_vfcvt_f(__riscv_vzext_vf4(src3, vl), vl); + + v00 = __riscv_vfmacc(v00, mx, __riscv_vfsub(v01, v00, vl), vl); + v02 = __riscv_vfmacc(v02, mx, __riscv_vfsub(v03, v02, vl), vl); + v00 = __riscv_vfmacc(v00, my, __riscv_vfsub(v02, v00, vl), vl); + v10 = __riscv_vfmacc(v10, mx, __riscv_vfsub(v11, v10, vl), vl); + v12 = __riscv_vfmacc(v12, mx, __riscv_vfsub(v13, v12, vl), vl); + v10 = __riscv_vfmacc(v10, my, __riscv_vfsub(v12, v10, vl), vl); + v20 = __riscv_vfmacc(v20, mx, __riscv_vfsub(v21, v20, vl), vl); + v22 = __riscv_vfmacc(v22, mx, __riscv_vfsub(v23, v22, vl), vl); + v20 = __riscv_vfmacc(v20, my, __riscv_vfsub(v22, v20, vl), vl); + v30 = __riscv_vfmacc(v30, mx, __riscv_vfsub(v31, v30, vl), vl); + v32 = __riscv_vfmacc(v32, mx, __riscv_vfsub(v33, v32, vl), vl); + v30 = __riscv_vfmacc(v30, my, __riscv_vfsub(v32, v30, vl), vl); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vfncvt_xu(v00, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vfncvt_xu(v10, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vfncvt_xu(v20, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vfncvt_xu(v30, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +// the algorithm is copied from 3rdparty/carotene/src/remap.cpp, +// in the function void CAROTENE_NS::remapNearestNeighbor and void CAROTENE_NS::remapLinear +template +inline int remap32f(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* mapx, size_t mapx_step, float* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4 && src_type != CV_16UC1 && src_type != CV_16SC1 && src_type != CV_32FC1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if (border_type != CV_HAL_BORDER_CONSTANT && border_type != CV_HAL_BORDER_REPLICATE) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + const int mode = interpolation & ~CV_HAL_WARP_RELATIVE_MAP; + if (mode != CV_HAL_INTER_NEAREST && mode != CV_HAL_INTER_LINEAR && mode != CV_HAL_INTER_CUBIC && mode != CV_HAL_INTER_LANCZOS4) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + if ((mode == CV_HAL_INTER_CUBIC || mode == CV_HAL_INTER_LANCZOS4) && CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + mapx_step /= s16 ? sizeof(short) : sizeof(float); + mapy_step /= s16 ? 
sizeof(ushort) : sizeof(float);
+    switch (src_type)
+    {
+    case CV_8UC3:
+        return invoke(dst_width, dst_height, {remap32fC3}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_8UC4:
+        return invoke(dst_width, dst_height, {remap32fC4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    }
+    switch (mode*100 + src_type)
+    {
+    case CV_HAL_INTER_NEAREST*100 + CV_8UC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_16UC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_16UC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_16SC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_16SC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_NEAREST*100 + CV_32FC1:
+    case CV_HAL_INTER_LINEAR*100 + CV_32FC1:
+        return invoke(dst_width, dst_height, {remap32fC1}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+
+    case CV_HAL_INTER_CUBIC*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_16UC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_16SC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_CUBIC*100 + CV_32FC1:
+        return invoke(dst_width, dst_height, {remap32fCubic}, s16, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+
+    // Lanczos4 is disabled for Clang, since the register allocation strategy in Clang 20.0 is buggy;
+    // remove this #ifndef in the future once that is fixed
+#ifndef __clang__
+    case CV_HAL_INTER_LANCZOS4*100 + CV_8UC1:
+        return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    // disabled since the universal intrinsics (UI) path is already fast enough
+    // case CV_HAL_INTER_LANCZOS4*100 + CV_16UC1:
+    //     return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value);
+    case CV_HAL_INTER_LANCZOS4*100 + CV_16SC1:
+        return invoke(dst_width, dst_height,
{remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); + case CV_HAL_INTER_LANCZOS4*100 + CV_32FC1: + return invoke(dst_width, dst_height, {remap32fLanczos4}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, mapx, mapx_step, mapy, mapy_step, interpolation, border_type, border_value); +#endif + } + + return CV_HAL_ERROR_NOT_IMPLEMENTED; +} + +inline int remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + float* map, size_t map_step, int interpolation, int border_type, const double border_value[4]) +{ + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, map, map_step, nullptr, 0, interpolation, border_type, border_value); +} + +inline int remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, + uchar *dst_data, size_t dst_step, int dst_width, int dst_height, + short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step, + int interpolation, int border_type, const double border_value[4]) +{ + if (CV_MAKETYPE(src_type, 1) != src_type) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return remap32f(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, reinterpret_cast(mapx), mapx_step, reinterpret_cast(mapy), mapy_step, interpolation, border_type, border_value); +} +} // cv::cv_hal_rvv::remap + +namespace warp { +#undef cv_hal_warpAffine +#define cv_hal_warpAffine cv::cv_hal_rvv::warp::warpAffine +#undef cv_hal_warpPerspective +#define cv_hal_warpPerspective cv::cv_hal_rvv::warp::warpPerspective + +template +static inline int warpC1(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8m1(dst_width - j); + auto access = [&](vint32m4_t ix, vint32m4_t iy) { + auto ux = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxei32_v_u8m1(src_data, __riscv_vmadd(uy, src_step, ux, vl), vl); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m4_i32m4(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m4_i32m4(uy), vl), vl); + src = __riscv_vmerge(src, borderValue[0], mask, vl); + } + return src; + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m4(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[1], __riscv_vfmv_v_f_f32m4(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[4], __riscv_vfmv_v_f_f32m4(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m4(i, vl), M[7], __riscv_vfmv_v_f_f32m4(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, 
vl); + __riscv_vse8(dst_data + i * dst_step + j, access(ix, iy), vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m4(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + auto v0 = __riscv_vzext_vf4(access(ix0, iy0), vl); + auto v1 = __riscv_vzext_vf4(access(ix1, iy0), vl); + auto v2 = __riscv_vzext_vf4(access(ix0, iy1), vl); + auto v3 = __riscv_vzext_vf4(access(ix1, iy1), vl); + + auto rx = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), rx, __riscv_vsub(v1, v0, vl), vl); + v2 = __riscv_vmacc(__riscv_vmul(v2, 1 << 5, vl), rx, __riscv_vsub(v3, v2, vl), vl); + v0 = __riscv_vmacc(__riscv_vmul(v0, 1 << 5, vl), ry, __riscv_vsub(v2, v0, vl), vl); + __riscv_vse8(dst_data + i * dst_step + j, __riscv_vnclipu(__riscv_vnclipu(v0, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl), vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int warpC3(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 1, vl)); + auto src = __riscv_vloxseg3ei32_v_u8mf2x3(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 3, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x3_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x3_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x3_u8mf2(src, 2); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, borderValue[0], mask, vl); + src1 = __riscv_vmerge(src1, borderValue[1], mask, vl); + src2 = __riscv_vmerge(src2, borderValue[2], mask, vl); + } + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2; + access(ix, iy, src0, src1, src2); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, src0); + dst = 
__riscv_vset_v_u8mf2_u8mf2x3(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, src2); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + vuint32m2_t v00, v10, v20; + vuint32m2_t v01, v11, v21; + vuint32m2_t v02, v12, v22; + vuint32m2_t v03, v13, v23; + vuint8mf2_t src0, src1, src2; + access(ix0, iy0, src0, src1, src2); + v00 = __riscv_vzext_vf4(src0, vl); + v10 = __riscv_vzext_vf4(src1, vl); + v20 = __riscv_vzext_vf4(src2, vl); + access(ix1, iy0, src0, src1, src2); + v01 = __riscv_vzext_vf4(src0, vl); + v11 = __riscv_vzext_vf4(src1, vl); + v21 = __riscv_vzext_vf4(src2, vl); + access(ix0, iy1, src0, src1, src2); + v02 = __riscv_vzext_vf4(src0, vl); + v12 = __riscv_vzext_vf4(src1, vl); + v22 = __riscv_vzext_vf4(src2, vl); + access(ix1, iy1, src0, src1, src2); + v03 = __riscv_vzext_vf4(src0, vl); + v13 = __riscv_vzext_vf4(src1, vl); + v23 = __riscv_vzext_vf4(src2, vl); + + auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl); + v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl); + v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl); + v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl); + v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl); + v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl); + v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl); + v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl); + vuint8mf2x3_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + dst = __riscv_vset_v_u8mf2_u8mf2x3(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl)); + __riscv_vsseg3e8(dst_data + i * dst_step + j * 3, dst, vl); + } + } + } + + return CV_HAL_ERROR_OK; +} + +template +static inline int warpC4(int start, int end, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, const double* M, int interpolation, int borderType, const double* borderValue) +{ + for (int i = start; i < end; i++) + { + int vl; + for (int j = 0; j < dst_width; j += vl) + { + vl = __riscv_vsetvl_e8mf2(dst_width - j); + auto access = [&](vint32m2_t ix, vint32m2_t iy, vuint8mf2_t& src0, vuint8mf2_t& src1, vuint8mf2_t& src2, vuint8mf2_t& src3) { + auto ux = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(ix, 0, vl), src_width - 1, vl)); + auto uy = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vmin(__riscv_vmax(iy, 0, vl), src_height - 
1, vl)); + auto src = __riscv_vloxseg4ei32_v_u8mf2x4(src_data, __riscv_vmadd(uy, src_step, __riscv_vmul(ux, 4, vl), vl), vl); + src0 = __riscv_vget_v_u8mf2x4_u8mf2(src, 0); + src1 = __riscv_vget_v_u8mf2x4_u8mf2(src, 1); + src2 = __riscv_vget_v_u8mf2x4_u8mf2(src, 2); + src3 = __riscv_vget_v_u8mf2x4_u8mf2(src, 3); + if (borderType == CV_HAL_BORDER_CONSTANT) + { + auto mask = __riscv_vmor(__riscv_vmsne(ix, __riscv_vreinterpret_v_u32m2_i32m2(ux), vl), __riscv_vmsne(iy, __riscv_vreinterpret_v_u32m2_i32m2(uy), vl), vl); + src0 = __riscv_vmerge(src0, borderValue[0], mask, vl); + src1 = __riscv_vmerge(src1, borderValue[1], mask, vl); + src2 = __riscv_vmerge(src2, borderValue[2], mask, vl); + src3 = __riscv_vmerge(src3, borderValue[3], mask, vl); + } + }; + + auto id = __riscv_vfcvt_f(__riscv_vadd(__riscv_vid_v_u32m2(vl), j, vl), vl); + auto mx = __riscv_vfmadd(id, M[0], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[1], __riscv_vfmv_v_f_f32m2(M[2], vl), vl), vl); + auto my = __riscv_vfmadd(id, M[3], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[4], __riscv_vfmv_v_f_f32m2(M[5], vl), vl), vl); + if (perspective) + { + auto md = __riscv_vfrdiv(__riscv_vfmadd(id, M[6], __riscv_vfmadd(__riscv_vfmv_v_f_f32m2(i, vl), M[7], __riscv_vfmv_v_f_f32m2(M[8], vl), vl), vl), 1, vl); + mx = __riscv_vfmul(mx, md, vl); + my = __riscv_vfmul(my, md, vl); + } + + if (interpolation == CV_HAL_INTER_NEAREST) + { + auto ix = __riscv_vfcvt_x(mx, vl), iy = __riscv_vfcvt_x(my, vl); + vuint8mf2_t src0, src1, src2, src3; + access(ix, iy, src0, src1, src2, src3); + vuint8mf2x4_t dst{}; + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, src0); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, src1); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, src2); + dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, src3); + __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl); + } + else + { + auto ix = __riscv_vfcvt_x(__riscv_vfmadd(mx, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto iy = __riscv_vfcvt_x(__riscv_vfmadd(my, 1 << 10, __riscv_vfmv_v_f_f32m2(1 << 4, vl), vl), vl); + auto ix0 = __riscv_vsra(ix, 10, vl), iy0 = __riscv_vsra(iy, 10, vl); + auto ix1 = __riscv_vadd(ix0, 1, vl), iy1 = __riscv_vadd(iy0, 1, vl); + + vuint32m2_t v00, v10, v20, v30; + vuint32m2_t v01, v11, v21, v31; + vuint32m2_t v02, v12, v22, v32; + vuint32m2_t v03, v13, v23, v33; + vuint8mf2_t src0, src1, src2, src3; + access(ix0, iy0, src0, src1, src2, src3); + v00 = __riscv_vzext_vf4(src0, vl); + v10 = __riscv_vzext_vf4(src1, vl); + v20 = __riscv_vzext_vf4(src2, vl); + v30 = __riscv_vzext_vf4(src3, vl); + access(ix1, iy0, src0, src1, src2, src3); + v01 = __riscv_vzext_vf4(src0, vl); + v11 = __riscv_vzext_vf4(src1, vl); + v21 = __riscv_vzext_vf4(src2, vl); + v31 = __riscv_vzext_vf4(src3, vl); + access(ix0, iy1, src0, src1, src2, src3); + v02 = __riscv_vzext_vf4(src0, vl); + v12 = __riscv_vzext_vf4(src1, vl); + v22 = __riscv_vzext_vf4(src2, vl); + v32 = __riscv_vzext_vf4(src3, vl); + access(ix1, iy1, src0, src1, src2, src3); + v03 = __riscv_vzext_vf4(src0, vl); + v13 = __riscv_vzext_vf4(src1, vl); + v23 = __riscv_vzext_vf4(src2, vl); + v33 = __riscv_vzext_vf4(src3, vl); + + auto rx = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(ix, 5, vl), (1 << 5) - 1, vl)); + auto ry = __riscv_vreinterpret_v_i32m2_u32m2(__riscv_vand(__riscv_vsra(iy, 5, vl), (1 << 5) - 1, vl)); + v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), rx, __riscv_vsub(v01, v00, vl), vl); + v02 = __riscv_vmacc(__riscv_vmul(v02, 1 << 5, vl), rx, __riscv_vsub(v03, v02, vl), vl); + v00 = 
+                v00 = __riscv_vmacc(__riscv_vmul(v00, 1 << 5, vl), ry, __riscv_vsub(v02, v00, vl), vl);
+                v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), rx, __riscv_vsub(v11, v10, vl), vl);
+                v12 = __riscv_vmacc(__riscv_vmul(v12, 1 << 5, vl), rx, __riscv_vsub(v13, v12, vl), vl);
+                v10 = __riscv_vmacc(__riscv_vmul(v10, 1 << 5, vl), ry, __riscv_vsub(v12, v10, vl), vl);
+                v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), rx, __riscv_vsub(v21, v20, vl), vl);
+                v22 = __riscv_vmacc(__riscv_vmul(v22, 1 << 5, vl), rx, __riscv_vsub(v23, v22, vl), vl);
+                v20 = __riscv_vmacc(__riscv_vmul(v20, 1 << 5, vl), ry, __riscv_vsub(v22, v20, vl), vl);
+                v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), rx, __riscv_vsub(v31, v30, vl), vl);
+                v32 = __riscv_vmacc(__riscv_vmul(v32, 1 << 5, vl), rx, __riscv_vsub(v33, v32, vl), vl);
+                v30 = __riscv_vmacc(__riscv_vmul(v30, 1 << 5, vl), ry, __riscv_vsub(v32, v30, vl), vl);
+                vuint8mf2x4_t dst{};
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 0, __riscv_vnclipu(__riscv_vnclipu(v00, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 1, __riscv_vnclipu(__riscv_vnclipu(v10, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 2, __riscv_vnclipu(__riscv_vnclipu(v20, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                dst = __riscv_vset_v_u8mf2_u8mf2x4(dst, 3, __riscv_vnclipu(__riscv_vnclipu(v30, 10, __RISCV_VXRM_RNU, vl), 0, __RISCV_VXRM_RNU, vl));
+                __riscv_vsseg4e8(dst_data + i * dst_step + j * 4, dst, vl);
+            }
+        }
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+// the algorithm is copied from 3rdparty/carotene/src/warp_affine.cpp,
+// in the functions void CAROTENE_NS::warpAffineNearestNeighbor and void CAROTENE_NS::warpAffineLinear
+inline int warpAffine(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4])
+{
+    if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    switch (src_type)
+    {
+    case CV_8UC1:
+        return remap::invoke(dst_width, dst_height, {warpC1<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC3:
+        return remap::invoke(dst_width, dst_height, {warpC3<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC4:
+        return remap::invoke(dst_width, dst_height, {warpC4<false>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
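warpAffine above and warpPerspective below differ only in the template argument they pass to `warpC1/C3/C4`; the per-pixel coordinate transform that argument selects looks like this in scalar form (illustrative helper, not part of the patch):

```cpp
// Scalar equivalent of the mx/my computation in warpC1/C3/C4.
// M holds 6 coefficients in the affine case and 9 in the perspective case;
// the vector path computes the reciprocal with vfrdiv instead of a divide.
static inline void map_pixel(const double* M, bool perspective,
                             int x, int y, float& mx, float& my)
{
    mx = (float)(M[0] * x + M[1] * y + M[2]);
    my = (float)(M[3] * x + M[4] * y + M[5]);
    if (perspective)
    {
        float md = 1.0f / (float)(M[6] * x + M[7] * y + M[8]);
        mx *= md;
        my *= md;
    }
}
```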
+
+// the algorithm is copied from 3rdparty/carotene/src/warp_perspective.cpp,
+// in the functions void CAROTENE_NS::warpPerspectiveNearestNeighbor and void CAROTENE_NS::warpPerspectiveLinear
+inline int warpPerspective(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height, uchar *dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4])
+{
+    if (src_type != CV_8UC1 && src_type != CV_8UC3 && src_type != CV_8UC4)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (borderType != CV_HAL_BORDER_CONSTANT && borderType != CV_HAL_BORDER_REPLICATE)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    if (interpolation != CV_HAL_INTER_NEAREST && interpolation != CV_HAL_INTER_LINEAR)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    switch (src_type)
+    {
+    case CV_8UC1:
+        return remap::invoke(dst_width, dst_height, {warpC1<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC3:
+        return remap::invoke(dst_width, dst_height, {warpC3<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    case CV_8UC4:
+        return remap::invoke(dst_width, dst_height, {warpC4<true>}, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, M, interpolation, borderType, borderValue);
+    }
+
+    return CV_HAL_ERROR_NOT_IMPLEMENTED;
+}
+} // cv::cv_hal_rvv::warp
+
+}}
+
+#endif
diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp
index 26aa58e77e..645e9557ed 100644
--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -373,9 +373,58 @@ inline int hal_ni_remap32f(int src_type, const uchar *src_data, size_t src_step,
                            float* mapx, size_t mapx_step, float* mapy, size_t mapy_step,
                            int interpolation, int border_type, const double border_value[4])
 { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
+/**
+   @brief hal_remap with floating point maps
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param map map for xy values
+   @param map_step map matrix step
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param border_type border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param border_value values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::remap
+ */
+inline int hal_ni_remap32fc2(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                             uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                             float* map, size_t map_step, int interpolation, int border_type, const double border_value[4])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
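Unlike `cv_hal_remap32f`, which receives two single-channel float maps, this variant receives one interleaved two-channel (CV_32FC2) map. A hypothetical helper showing the per-pixel lookup (assuming `map_step` is in bytes, as for the other HAL matrix steps):

```cpp
#include <cstddef>

// How dst(x, y) finds its source coordinate in the interleaved CV_32FC2 map
// passed to cv_hal_remap32fc2 (illustrative only).
static inline void read_map32fc2(const float* map, size_t map_step,
                                 int x, int y, float& sx, float& sy)
{
    const float* row = (const float*)((const char*)map + y * map_step);
    sx = row[2 * x];      // source x for destination pixel (x, y)
    sy = row[2 * x + 1];  // source y
}
```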
+/**
+   @brief hal_remap with fixed-point maps
+   @param src_type source and destination image type
+   @param src_data source image data
+   @param src_step source image step
+   @param src_width source image width
+   @param src_height source image height
+   @param dst_data destination image data
+   @param dst_step destination image step
+   @param dst_width destination image width
+   @param dst_height destination image height
+   @param mapx map for x values
+   @param mapx_step mapx matrix step
+   @param mapy map for y values
+   @param mapy_step mapy matrix step
+   @param interpolation interpolation mode (CV_HAL_INTER_NEAREST, ...)
+   @param border_type border processing mode (CV_HAL_BORDER_REFLECT, ...)
+   @param border_value values to use for CV_HAL_BORDER_CONSTANT mode
+   @sa cv::remap
+ */
+inline int hal_ni_remap16s(int src_type, const uchar *src_data, size_t src_step, int src_width, int src_height,
+                           uchar *dst_data, size_t dst_step, int dst_width, int dst_height,
+                           short* mapx, size_t mapx_step, ushort* mapy, size_t mapy_step,
+                           int interpolation, int border_type, const double border_value[4])
+{ return CV_HAL_ERROR_NOT_IMPLEMENTED; }
 
 //! @cond IGNORED
 #define cv_hal_remap32f hal_ni_remap32f
+#define cv_hal_remap32fc2 hal_ni_remap32fc2
+#define cv_hal_remap16s hal_ni_remap16s
 //! @endcond
 
 /**
diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 38f61333cf..f348860549 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1720,6 +1720,16 @@ void cv::remap( InputArray _src, OutputArray _dst,
         CALL_HAL(remap32f, cv_hal_remap32f, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
                  map1.ptr<float>(), map1.step, map2.ptr<float>(), map2.step, interpolation, borderType, borderValue.val);
     }
+    if ((map1.type() == CV_32FC2) && map2.empty())
+    {
+        CALL_HAL(remap32fc2, cv_hal_remap32fc2, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                 map1.ptr<float>(), map1.step, interpolation, borderType, borderValue.val);
+    }
+    if ((map1.type() == CV_16SC2) && (map2.empty() || map2.type() == CV_16UC1))
+    {
+        CALL_HAL(remap16s, cv_hal_remap16s, src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
+                 map1.ptr<short>(), map1.step, map2.ptr<ushort>(), map2.step, interpolation, borderType, borderValue.val);
+    }
 
     interpolation &= ~WARP_RELATIVE_MAP;
     if( interpolation == INTER_AREA )
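For reference, a user-level sketch of how both new hooks are reached on a build where this HAL is active: cv::remap forwards a CV_32FC2 map with an empty second map to `cv_hal_remap32fc2`, and the CV_16SC2/CV_16UC1 pair produced by cv::convertMaps to `cv_hal_remap16s`. The map values below (a horizontal flip) are arbitrary:

```cpp
#include <opencv2/imgproc.hpp>

// Exercises both new remap hooks via the public API.
void remap_demo(const cv::Mat& src, cv::Mat& dst)
{
    cv::Mat map1(src.size(), CV_32FC2);
    for (int y = 0; y < map1.rows; y++)
        for (int x = 0; x < map1.cols; x++)
            map1.at<cv::Vec2f>(y, x) = cv::Vec2f((float)(map1.cols - 1 - x), (float)y);

    // CV_32FC2 map, no second map -> cv_hal_remap32fc2
    cv::remap(src, dst, map1, cv::noArray(), cv::INTER_LINEAR);

    // Fixed-point maps from convertMaps (CV_16SC2 + CV_16UC1) -> cv_hal_remap16s
    cv::Mat fixed1, fixed2;
    cv::convertMaps(map1, cv::noArray(), fixed1, fixed2, CV_16SC2);
    cv::remap(src, dst, fixed1, fixed2, cv::INTER_LINEAR);
}
```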