From 60de3ff24f88b840fe95eab24e374d6e97ee76ee Mon Sep 17 00:00:00 2001 From: GenshinImpactStarts <147074368+GenshinImpactStarts@users.noreply.github.com> Date: Wed, 12 Mar 2025 13:34:27 +0800 Subject: [PATCH] Merge pull request #27015 from GenshinImpactStarts:sqrt [HAL RVV] impl sqrt and invSqrt #27015 Implement through the existing interfaces `cv_hal_sqrt32f`, `cv_hal_sqrt64f`, `cv_hal_invSqrt32f`, `cv_hal_invSqrt64f`. Perf test done on MUSE-PI and CanMV K230. Because the performance of scalar is much worse than universal intrinsic, only ui and hal rvv is compared. In RVV's UI, `invSqrt` is computed using `1 / sqrt()`. This patch first uses `frsqrt` and then applies the Newton-Raphson method to achieve higher precision. For the initial value, I tried using the famous [fast inverse square root algorithm](https://en.wikipedia.org/wiki/Fast_inverse_square_root), which involves one bit shift and one subtraction. However, on both MUSE-PI and CanMV K230, the performance was slightly lower (about 3%), so I chose to use `frsqrt` for the initial value instead. BTW, I think this patch can directly replace RVV's UI. **UPDATE**: Due to strange vector registers allocation strategy in clang, for `invSqrt`, clang use LMUL m4 while gcc use LMUL m8, which leads to some performance loss in clang. So the test for clang is appended. ```sh $ opencv_test_core --gtest_filter="Core_HAL/mathfuncs.*" $ opencv_perf_core --gtest_filter="SqrtFixture.*" --perf_min_samples=300 --perf_force_samples=300 ``` CanMV K230: ``` Name of Test ui rvv rvv vs ui (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.052 0.027 1.96 Sqrt::SqrtFixture::(127x61, 5, true) 0.101 0.026 3.80 Sqrt::SqrtFixture::(127x61, 6, false) 0.106 0.059 1.79 Sqrt::SqrtFixture::(127x61, 6, true) 0.207 0.058 3.55 Sqrt::SqrtFixture::(640x480, 5, false) 1.988 0.956 2.08 Sqrt::SqrtFixture::(640x480, 5, true) 3.920 0.948 4.13 Sqrt::SqrtFixture::(640x480, 6, false) 4.179 2.342 1.78 Sqrt::SqrtFixture::(640x480, 6, true) 8.220 2.290 3.59 Sqrt::SqrtFixture::(1280x720, 5, false) 5.969 2.881 2.07 Sqrt::SqrtFixture::(1280x720, 5, true) 11.731 2.857 4.11 Sqrt::SqrtFixture::(1280x720, 6, false) 12.533 7.031 1.78 Sqrt::SqrtFixture::(1280x720, 6, true) 24.643 6.917 3.56 Sqrt::SqrtFixture::(1920x1080, 5, false) 13.423 6.483 2.07 Sqrt::SqrtFixture::(1920x1080, 5, true) 26.379 6.436 4.10 Sqrt::SqrtFixture::(1920x1080, 6, false) 28.200 15.833 1.78 Sqrt::SqrtFixture::(1920x1080, 6, true) 55.434 15.565 3.56 ``` MUSE-PI: ``` GCC | clang Name of Test ui rvv rvv | ui rvv rvv vs | vs ui | ui (x-factor) | (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.027 0.018 1.46 | 0.027 0.016 1.65 Sqrt::SqrtFixture::(127x61, 5, true) 0.050 0.017 2.98 | 0.050 0.017 2.99 Sqrt::SqrtFixture::(127x61, 6, false) 0.053 0.031 1.72 | 0.052 0.032 1.64 Sqrt::SqrtFixture::(127x61, 6, true) 0.100 0.030 3.31 | 0.101 0.035 2.86 Sqrt::SqrtFixture::(640x480, 5, false) 0.955 0.483 1.98 | 0.959 0.499 1.92 Sqrt::SqrtFixture::(640x480, 5, true) 1.873 0.489 3.83 | 1.873 0.520 3.60 Sqrt::SqrtFixture::(640x480, 6, false) 2.027 1.163 1.74 | 2.037 1.218 1.67 Sqrt::SqrtFixture::(640x480, 6, true) 3.961 1.153 3.44 | 3.961 1.341 2.95 Sqrt::SqrtFixture::(1280x720, 5, false) 2.916 1.538 1.90 | 2.912 1.598 1.82 Sqrt::SqrtFixture::(1280x720, 5, true) 5.735 1.534 3.74 | 5.726 1.661 3.45 Sqrt::SqrtFixture::(1280x720, 6, false) 6.121 3.585 1.71 | 6.109 3.725 1.64 Sqrt::SqrtFixture::(1280x720, 6, true) 12.059 3.501 3.44 | 12.053 4.080 2.95 Sqrt::SqrtFixture::(1920x1080, 5, false) 6.540 3.535 1.85 | 6.540 3.643 1.80 Sqrt::SqrtFixture::(1920x1080, 5, true) 12.943 3.445 3.76 | 12.908 3.706 3.48 Sqrt::SqrtFixture::(1920x1080, 6, false) 13.714 8.062 1.70 | 13.711 8.376 1.64 Sqrt::SqrtFixture::(1920x1080, 6, true) 27.011 7.989 3.38 | 27.115 9.245 2.93 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- 3rdparty/hal_rvv/hal_rvv.hpp | 1 + 3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp | 122 ++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index a4dd6c2326..57d2ccfee5 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -38,6 +38,7 @@ #include "hal_rvv_1p0/cholesky.hpp" // core #include "hal_rvv_1p0/qr.hpp" // core #include "hal_rvv_1p0/svd.hpp" // core +#include "hal_rvv_1p0/sqrt.hpp" // core #include "hal_rvv_1p0/filter.hpp" // imgproc #include "hal_rvv_1p0/pyramids.hpp" // imgproc diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp new file mode 100644 index 0000000000..9a2e5d6bfe --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp @@ -0,0 +1,122 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED +#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED + +#include +#include +#include "hal_rvv_1p0/types.hpp" + +namespace cv { namespace cv_hal_rvv { + +#undef cv_hal_sqrt32f +#undef cv_hal_sqrt64f +#undef cv_hal_invSqrt32f +#undef cv_hal_invSqrt64f + +#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt> +#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt> + +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. +# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> +# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> +#else +# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> +# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> +#endif + +namespace detail { + +// Newton-Raphson method +// Use 4 LMUL registers +template +inline VEC_T sqrt(VEC_T x, size_t vl) +{ + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#pragma unroll + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul(t, y, vl); + } + // just to prevent the compiler from calculating mask before the invSqrt, which will run out + // of registers and cause memory access. + asm volatile("" ::: "memory"); + auto mask = __riscv_vmfne(x, 0.0, vl); + mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl); + return __riscv_vfmul_mu(mask, x, x, y, vl); +} + +// Newton-Raphson method +// Use 3 LMUL registers and 1 mask register +template +inline VEC_T invSqrt(VEC_T x, size_t vl) +{ + auto mask = __riscv_vmfne(x, 0.0, vl); + mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl); + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#pragma unroll + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul_mu(mask, y, t, y, vl); + } + return y; +} + +} // namespace detail + +template +struct Sqrt32f +{ + using T = RVV_T; + static constexpr size_t iter_times = 2; +}; + +template +struct Sqrt64f +{ + using T = RVV_T; + static constexpr size_t iter_times = 3; +}; + +template +inline int sqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, detail::sqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +template +inline int invSqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, detail::invSqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +}} // namespace cv::cv_hal_rvv + +#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED