Merge pull request #27015 from GenshinImpactStarts:sqrt

[HAL RVV] impl sqrt and invSqrt #27015 Implement through the existing interfaces `cv_hal_sqrt32f`, `cv_hal_sqrt64f`, `cv_hal_invSqrt32f`, `cv_hal_invSqrt64f`. Perf test done on MUSE-PI and CanMV K230. Because the performance of scalar is much worse than universal intrinsic, only ui and hal rvv is compared. In RVV's UI, `invSqrt` is computed using `1 / sqrt()`. This patch first uses `frsqrt` and then applies the Newton-Raphson method to achieve higher precision. For the initial value, I tried using the famous [fast inverse square root algorithm](https://en.wikipedia.org/wiki/Fast_inverse_square_root), which involves one bit shift and one subtraction. However, on both MUSE-PI and CanMV K230, the performance was slightly lower (about 3%), so I chose to use `frsqrt` for the initial value instead. BTW, I think this patch can directly replace RVV's UI. **UPDATE**: Due to strange vector registers allocation strategy in clang, for `invSqrt`, clang use LMUL m4 while gcc use LMUL m8, which leads to some performance loss in clang. So the test for clang is appended. ```sh $ opencv_test_core --gtest_filter="Core_HAL/mathfuncs.*" $ opencv_perf_core --gtest_filter="SqrtFixture.*" --perf_min_samples=300 --perf_force_samples=300 ``` CanMV K230: ``` Name of Test ui rvv rvv vs ui (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.052 0.027 1.96 Sqrt::SqrtFixture::(127x61, 5, true) 0.101 0.026 3.80 Sqrt::SqrtFixture::(127x61, 6, false) 0.106 0.059 1.79 Sqrt::SqrtFixture::(127x61, 6, true) 0.207 0.058 3.55 Sqrt::SqrtFixture::(640x480, 5, false) 1.988 0.956 2.08 Sqrt::SqrtFixture::(640x480, 5, true) 3.920 0.948 4.13 Sqrt::SqrtFixture::(640x480, 6, false) 4.179 2.342 1.78 Sqrt::SqrtFixture::(640x480, 6, true) 8.220 2.290 3.59 Sqrt::SqrtFixture::(1280x720, 5, false) 5.969 2.881 2.07 Sqrt::SqrtFixture::(1280x720, 5, true) 11.731 2.857 4.11 Sqrt::SqrtFixture::(1280x720, 6, false) 12.533 7.031 1.78 Sqrt::SqrtFixture::(1280x720, 6, true) 24.643 6.917 3.56 Sqrt::SqrtFixture::(1920x1080, 5, false) 13.423 6.483 2.07 Sqrt::SqrtFixture::(1920x1080, 5, true) 26.379 6.436 4.10 Sqrt::SqrtFixture::(1920x1080, 6, false) 28.200 15.833 1.78 Sqrt::SqrtFixture::(1920x1080, 6, true) 55.434 15.565 3.56 ``` MUSE-PI: ``` GCC | clang Name of Test ui rvv rvv | ui rvv rvv vs | vs ui | ui (x-factor) | (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.027 0.018 1.46 | 0.027 0.016 1.65 Sqrt::SqrtFixture::(127x61, 5, true) 0.050 0.017 2.98 | 0.050 0.017 2.99 Sqrt::SqrtFixture::(127x61, 6, false) 0.053 0.031 1.72 | 0.052 0.032 1.64 Sqrt::SqrtFixture::(127x61, 6, true) 0.100 0.030 3.31 | 0.101 0.035 2.86 Sqrt::SqrtFixture::(640x480, 5, false) 0.955 0.483 1.98 | 0.959 0.499 1.92 Sqrt::SqrtFixture::(640x480, 5, true) 1.873 0.489 3.83 | 1.873 0.520 3.60 Sqrt::SqrtFixture::(640x480, 6, false) 2.027 1.163 1.74 | 2.037 1.218 1.67 Sqrt::SqrtFixture::(640x480, 6, true) 3.961 1.153 3.44 | 3.961 1.341 2.95 Sqrt::SqrtFixture::(1280x720, 5, false) 2.916 1.538 1.90 | 2.912 1.598 1.82 Sqrt::SqrtFixture::(1280x720, 5, true) 5.735 1.534 3.74 | 5.726 1.661 3.45 Sqrt::SqrtFixture::(1280x720, 6, false) 6.121 3.585 1.71 | 6.109 3.725 1.64 Sqrt::SqrtFixture::(1280x720, 6, true) 12.059 3.501 3.44 | 12.053 4.080 2.95 Sqrt::SqrtFixture::(1920x1080, 5, false) 6.540 3.535 1.85 | 6.540 3.643 1.80 Sqrt::SqrtFixture::(1920x1080, 5, true) 12.943 3.445 3.76 | 12.908 3.706 3.48 Sqrt::SqrtFixture::(1920x1080, 6, false) 13.714 8.062 1.70 | 13.711 8.376 1.64 Sqrt::SqrtFixture::(1920x1080, 6, true) 27.011 7.989 3.38 | 27.115 9.245 2.93 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-06-23 04:01:31 +08:00 · 2025-03-12 13:34:27 +08:00 · 2025-03-12 13:34:27 +08:00 · 60de3ff24f
commit 60de3ff24f
parent 656038346b
2 changed files with 123 additions and 0 deletions
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@ -38,6 +38,7 @@
 #include "hal_rvv_1p0/cholesky.hpp" // core
 #include "hal_rvv_1p0/qr.hpp" // core
 #include "hal_rvv_1p0/svd.hpp" // core
+#include "hal_rvv_1p0/sqrt.hpp" // core

 #include "hal_rvv_1p0/filter.hpp" // imgproc
 #include "hal_rvv_1p0/pyramids.hpp" // imgproc
--- a/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
@ -0,0 +1,122 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level
+// directory of this distribution and at http://opencv.org/license.html.
+#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
+#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
+
+#include <riscv_vector.h>
+#include <cmath>
+#include "hal_rvv_1p0/types.hpp"
+
+namespace cv { namespace cv_hal_rvv {
+
+#undef cv_hal_sqrt32f
+#undef cv_hal_sqrt64f
+#undef cv_hal_invSqrt32f
+#undef cv_hal_invSqrt64f
+
+#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
+#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
+
+#ifdef __clang__
+// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access.
+// So a smaller LMUL is used here.
+#    define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M4>>
+#    define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M4>>
+#else
+#    define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
+#    define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
+#endif
+
+namespace detail {
+
+// Newton-Raphson method
+// Use 4 LMUL registers
+template <size_t iter_times, typename VEC_T>
+inline VEC_T sqrt(VEC_T x, size_t vl)
+{
+    auto x2 = __riscv_vfmul(x, 0.5, vl);
+    auto y = __riscv_vfrsqrt7(x, vl);
+#pragma unroll
+    for (size_t i = 0; i < iter_times; i++)
+    {
+        auto t = __riscv_vfmul(y, y, vl);
+        t = __riscv_vfmul(t, x2, vl);
+        t = __riscv_vfrsub(t, 1.5, vl);
+        y = __riscv_vfmul(t, y, vl);
+    }
+    // just to prevent the compiler from calculating mask before the invSqrt, which will run out
+    // of registers and cause memory access.
+    asm volatile("" ::: "memory");
+    auto mask = __riscv_vmfne(x, 0.0, vl);
+    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    return __riscv_vfmul_mu(mask, x, x, y, vl);
+}
+
+// Newton-Raphson method
+// Use 3 LMUL registers and 1 mask register
+template <size_t iter_times, typename VEC_T>
+inline VEC_T invSqrt(VEC_T x, size_t vl)
+{
+    auto mask = __riscv_vmfne(x, 0.0, vl);
+    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    auto x2 = __riscv_vfmul(x, 0.5, vl);
+    auto y = __riscv_vfrsqrt7(x, vl);
+#pragma unroll
+    for (size_t i = 0; i < iter_times; i++)
+    {
+        auto t = __riscv_vfmul(y, y, vl);
+        t = __riscv_vfmul(t, x2, vl);
+        t = __riscv_vfrsub(t, 1.5, vl);
+        y = __riscv_vfmul_mu(mask, y, t, y, vl);
+    }
+    return y;
+}
+
+}  // namespace detail
+
+template <typename RVV_T>
+struct Sqrt32f
+{
+    using T = RVV_T;
+    static constexpr size_t iter_times = 2;
+};
+
+template <typename RVV_T>
+struct Sqrt64f
+{
+    using T = RVV_T;
+    static constexpr size_t iter_times = 3;
+};
+
+template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
+inline int sqrt(const Elem* src, Elem* dst, int _len)
+{
+    size_t vl;
+    for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
+    {
+        vl = SQRT_T::T::setvl(len);
+        auto x = SQRT_T::T::vload(src, vl);
+        SQRT_T::T::vstore(dst, detail::sqrt<SQRT_T::iter_times>(x, vl), vl);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
+inline int invSqrt(const Elem* src, Elem* dst, int _len)
+{
+    size_t vl;
+    for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
+    {
+        vl = SQRT_T::T::setvl(len);
+        auto x = SQRT_T::T::vload(src, vl);
+        SQRT_T::T::vstore(dst, detail::invSqrt<SQRT_T::iter_times>(x, vl), vl);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+}}  // namespace cv::cv_hal_rvv
+
+#endif  // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED