Merge pull request #27015 from GenshinImpactStarts:sqrt

[HAL RVV] impl sqrt and invSqrt #27015

Implement through the existing interfaces `cv_hal_sqrt32f`, `cv_hal_sqrt64f`, `cv_hal_invSqrt32f`, `cv_hal_invSqrt64f`.

Perf test done on MUSE-PI and CanMV K230. Because the performance of scalar is much worse than universal intrinsic, only ui and hal rvv is compared.

In RVV's UI, `invSqrt` is computed using `1 / sqrt()`. This patch first uses `frsqrt` and then applies the Newton-Raphson method to achieve higher precision. For the initial value, I tried using the famous [fast inverse square root algorithm](https://en.wikipedia.org/wiki/Fast_inverse_square_root), which involves one bit shift and one subtraction. However, on both MUSE-PI and CanMV K230, the performance was slightly lower (about 3%), so I chose to use `frsqrt` for the initial value instead. 

BTW, I think this patch can directly replace RVV's UI.

**UPDATE**: Due to strange vector registers allocation strategy in clang, for `invSqrt`, clang use LMUL m4 while gcc use LMUL m8, which leads to some performance loss in clang. So the test for clang is appended.

```sh
$ opencv_test_core --gtest_filter="Core_HAL/mathfuncs.*"
$ opencv_perf_core --gtest_filter="SqrtFixture.*" --perf_min_samples=300 --perf_force_samples=300
```

CanMV K230:
```
              Name of Test                 ui    rvv      rvv    
                                                           vs    
                                                           ui    
                                                       (x-factor)
Sqrt::SqrtFixture::(127x61, 5, false)    0.052  0.027     1.96   
Sqrt::SqrtFixture::(127x61, 5, true)     0.101  0.026     3.80   
Sqrt::SqrtFixture::(127x61, 6, false)    0.106  0.059     1.79   
Sqrt::SqrtFixture::(127x61, 6, true)     0.207  0.058     3.55   
Sqrt::SqrtFixture::(640x480, 5, false)   1.988  0.956     2.08   
Sqrt::SqrtFixture::(640x480, 5, true)    3.920  0.948     4.13   
Sqrt::SqrtFixture::(640x480, 6, false)   4.179  2.342     1.78   
Sqrt::SqrtFixture::(640x480, 6, true)    8.220  2.290     3.59   
Sqrt::SqrtFixture::(1280x720, 5, false)  5.969  2.881     2.07   
Sqrt::SqrtFixture::(1280x720, 5, true)   11.731 2.857     4.11   
Sqrt::SqrtFixture::(1280x720, 6, false)  12.533 7.031     1.78   
Sqrt::SqrtFixture::(1280x720, 6, true)   24.643 6.917     3.56   
Sqrt::SqrtFixture::(1920x1080, 5, false) 13.423 6.483     2.07   
Sqrt::SqrtFixture::(1920x1080, 5, true)  26.379 6.436     4.10   
Sqrt::SqrtFixture::(1920x1080, 6, false) 28.200 15.833    1.78   
Sqrt::SqrtFixture::(1920x1080, 6, true)  55.434 15.565    3.56   
```

MUSE-PI:
```
                                                 GCC              |        clang            
              Name of Test                 ui    rvv      rvv     |   ui    rvv      rvv    
                                                           vs     |                   vs    
                                                           ui     |                   ui    
                                                       (x-factor) |               (x-factor)
Sqrt::SqrtFixture::(127x61, 5, false)    0.027  0.018     1.46    | 0.027  0.016     1.65   
Sqrt::SqrtFixture::(127x61, 5, true)     0.050  0.017     2.98    | 0.050  0.017     2.99   
Sqrt::SqrtFixture::(127x61, 6, false)    0.053  0.031     1.72    | 0.052  0.032     1.64   
Sqrt::SqrtFixture::(127x61, 6, true)     0.100  0.030     3.31    | 0.101  0.035     2.86   
Sqrt::SqrtFixture::(640x480, 5, false)   0.955  0.483     1.98    | 0.959  0.499     1.92   
Sqrt::SqrtFixture::(640x480, 5, true)    1.873  0.489     3.83    | 1.873  0.520     3.60   
Sqrt::SqrtFixture::(640x480, 6, false)   2.027  1.163     1.74    | 2.037  1.218     1.67   
Sqrt::SqrtFixture::(640x480, 6, true)    3.961  1.153     3.44    | 3.961  1.341     2.95   
Sqrt::SqrtFixture::(1280x720, 5, false)  2.916  1.538     1.90    | 2.912  1.598     1.82   
Sqrt::SqrtFixture::(1280x720, 5, true)   5.735  1.534     3.74    | 5.726  1.661     3.45   
Sqrt::SqrtFixture::(1280x720, 6, false)  6.121  3.585     1.71    | 6.109  3.725     1.64   
Sqrt::SqrtFixture::(1280x720, 6, true)   12.059 3.501     3.44    | 12.053 4.080     2.95   
Sqrt::SqrtFixture::(1920x1080, 5, false) 6.540  3.535     1.85    | 6.540  3.643     1.80   
Sqrt::SqrtFixture::(1920x1080, 5, true)  12.943 3.445     3.76    | 12.908 3.706     3.48   
Sqrt::SqrtFixture::(1920x1080, 6, false) 13.714 8.062     1.70    | 13.711 8.376     1.64   
Sqrt::SqrtFixture::(1920x1080, 6, true)  27.011 7.989     3.38    | 27.115 9.245     2.93   
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
GenshinImpactStarts 2025-03-12 13:34:27 +08:00 committed by GitHub
parent 656038346b
commit 60de3ff24f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 123 additions and 0 deletions

View File

@ -38,6 +38,7 @@
#include "hal_rvv_1p0/cholesky.hpp" // core
#include "hal_rvv_1p0/qr.hpp" // core
#include "hal_rvv_1p0/svd.hpp" // core
#include "hal_rvv_1p0/sqrt.hpp" // core
#include "hal_rvv_1p0/filter.hpp" // imgproc
#include "hal_rvv_1p0/pyramids.hpp" // imgproc

122
3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp vendored Normal file
View File

@ -0,0 +1,122 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
#include <riscv_vector.h>
#include <cmath>
#include "hal_rvv_1p0/types.hpp"
namespace cv { namespace cv_hal_rvv {
#undef cv_hal_sqrt32f
#undef cv_hal_sqrt64f
#undef cv_hal_invSqrt32f
#undef cv_hal_invSqrt64f
#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
#ifdef __clang__
// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access.
// So a smaller LMUL is used here.
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M4>>
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M4>>
#else
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
#endif
namespace detail {
// Newton-Raphson method
// Use 4 LMUL registers
template <size_t iter_times, typename VEC_T>
inline VEC_T sqrt(VEC_T x, size_t vl)
{
auto x2 = __riscv_vfmul(x, 0.5, vl);
auto y = __riscv_vfrsqrt7(x, vl);
#pragma unroll
for (size_t i = 0; i < iter_times; i++)
{
auto t = __riscv_vfmul(y, y, vl);
t = __riscv_vfmul(t, x2, vl);
t = __riscv_vfrsub(t, 1.5, vl);
y = __riscv_vfmul(t, y, vl);
}
// just to prevent the compiler from calculating mask before the invSqrt, which will run out
// of registers and cause memory access.
asm volatile("" ::: "memory");
auto mask = __riscv_vmfne(x, 0.0, vl);
mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
return __riscv_vfmul_mu(mask, x, x, y, vl);
}
// Newton-Raphson method
// Use 3 LMUL registers and 1 mask register
template <size_t iter_times, typename VEC_T>
inline VEC_T invSqrt(VEC_T x, size_t vl)
{
auto mask = __riscv_vmfne(x, 0.0, vl);
mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
auto x2 = __riscv_vfmul(x, 0.5, vl);
auto y = __riscv_vfrsqrt7(x, vl);
#pragma unroll
for (size_t i = 0; i < iter_times; i++)
{
auto t = __riscv_vfmul(y, y, vl);
t = __riscv_vfmul(t, x2, vl);
t = __riscv_vfrsub(t, 1.5, vl);
y = __riscv_vfmul_mu(mask, y, t, y, vl);
}
return y;
}
} // namespace detail
template <typename RVV_T>
struct Sqrt32f
{
using T = RVV_T;
static constexpr size_t iter_times = 2;
};
template <typename RVV_T>
struct Sqrt64f
{
using T = RVV_T;
static constexpr size_t iter_times = 3;
};
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int sqrt(const Elem* src, Elem* dst, int _len)
{
size_t vl;
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
{
vl = SQRT_T::T::setvl(len);
auto x = SQRT_T::T::vload(src, vl);
SQRT_T::T::vstore(dst, detail::sqrt<SQRT_T::iter_times>(x, vl), vl);
}
return CV_HAL_ERROR_OK;
}
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
inline int invSqrt(const Elem* src, Elem* dst, int _len)
{
size_t vl;
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
{
vl = SQRT_T::T::setvl(len);
auto x = SQRT_T::T::vload(src, vl);
SQRT_T::T::vstore(dst, detail::invSqrt<SQRT_T::iter_times>(x, vl), vl);
}
return CV_HAL_ERROR_OK;
}
}} // namespace cv::cv_hal_rvv
#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED