mirror of
https://github.com/opencv/opencv.git
synced 2025-06-22 19:51:26 +08:00
Merge pull request #27015 from GenshinImpactStarts:sqrt
[HAL RVV] impl sqrt and invSqrt #27015 Implement through the existing interfaces `cv_hal_sqrt32f`, `cv_hal_sqrt64f`, `cv_hal_invSqrt32f`, `cv_hal_invSqrt64f`. Perf test done on MUSE-PI and CanMV K230. Because the performance of scalar is much worse than universal intrinsic, only ui and hal rvv is compared. In RVV's UI, `invSqrt` is computed using `1 / sqrt()`. This patch first uses `frsqrt` and then applies the Newton-Raphson method to achieve higher precision. For the initial value, I tried using the famous [fast inverse square root algorithm](https://en.wikipedia.org/wiki/Fast_inverse_square_root), which involves one bit shift and one subtraction. However, on both MUSE-PI and CanMV K230, the performance was slightly lower (about 3%), so I chose to use `frsqrt` for the initial value instead. BTW, I think this patch can directly replace RVV's UI. **UPDATE**: Due to strange vector registers allocation strategy in clang, for `invSqrt`, clang use LMUL m4 while gcc use LMUL m8, which leads to some performance loss in clang. So the test for clang is appended. ```sh $ opencv_test_core --gtest_filter="Core_HAL/mathfuncs.*" $ opencv_perf_core --gtest_filter="SqrtFixture.*" --perf_min_samples=300 --perf_force_samples=300 ``` CanMV K230: ``` Name of Test ui rvv rvv vs ui (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.052 0.027 1.96 Sqrt::SqrtFixture::(127x61, 5, true) 0.101 0.026 3.80 Sqrt::SqrtFixture::(127x61, 6, false) 0.106 0.059 1.79 Sqrt::SqrtFixture::(127x61, 6, true) 0.207 0.058 3.55 Sqrt::SqrtFixture::(640x480, 5, false) 1.988 0.956 2.08 Sqrt::SqrtFixture::(640x480, 5, true) 3.920 0.948 4.13 Sqrt::SqrtFixture::(640x480, 6, false) 4.179 2.342 1.78 Sqrt::SqrtFixture::(640x480, 6, true) 8.220 2.290 3.59 Sqrt::SqrtFixture::(1280x720, 5, false) 5.969 2.881 2.07 Sqrt::SqrtFixture::(1280x720, 5, true) 11.731 2.857 4.11 Sqrt::SqrtFixture::(1280x720, 6, false) 12.533 7.031 1.78 Sqrt::SqrtFixture::(1280x720, 6, true) 24.643 6.917 3.56 Sqrt::SqrtFixture::(1920x1080, 5, false) 13.423 6.483 2.07 Sqrt::SqrtFixture::(1920x1080, 5, true) 26.379 6.436 4.10 Sqrt::SqrtFixture::(1920x1080, 6, false) 28.200 15.833 1.78 Sqrt::SqrtFixture::(1920x1080, 6, true) 55.434 15.565 3.56 ``` MUSE-PI: ``` GCC | clang Name of Test ui rvv rvv | ui rvv rvv vs | vs ui | ui (x-factor) | (x-factor) Sqrt::SqrtFixture::(127x61, 5, false) 0.027 0.018 1.46 | 0.027 0.016 1.65 Sqrt::SqrtFixture::(127x61, 5, true) 0.050 0.017 2.98 | 0.050 0.017 2.99 Sqrt::SqrtFixture::(127x61, 6, false) 0.053 0.031 1.72 | 0.052 0.032 1.64 Sqrt::SqrtFixture::(127x61, 6, true) 0.100 0.030 3.31 | 0.101 0.035 2.86 Sqrt::SqrtFixture::(640x480, 5, false) 0.955 0.483 1.98 | 0.959 0.499 1.92 Sqrt::SqrtFixture::(640x480, 5, true) 1.873 0.489 3.83 | 1.873 0.520 3.60 Sqrt::SqrtFixture::(640x480, 6, false) 2.027 1.163 1.74 | 2.037 1.218 1.67 Sqrt::SqrtFixture::(640x480, 6, true) 3.961 1.153 3.44 | 3.961 1.341 2.95 Sqrt::SqrtFixture::(1280x720, 5, false) 2.916 1.538 1.90 | 2.912 1.598 1.82 Sqrt::SqrtFixture::(1280x720, 5, true) 5.735 1.534 3.74 | 5.726 1.661 3.45 Sqrt::SqrtFixture::(1280x720, 6, false) 6.121 3.585 1.71 | 6.109 3.725 1.64 Sqrt::SqrtFixture::(1280x720, 6, true) 12.059 3.501 3.44 | 12.053 4.080 2.95 Sqrt::SqrtFixture::(1920x1080, 5, false) 6.540 3.535 1.85 | 6.540 3.643 1.80 Sqrt::SqrtFixture::(1920x1080, 5, true) 12.943 3.445 3.76 | 12.908 3.706 3.48 Sqrt::SqrtFixture::(1920x1080, 6, false) 13.714 8.062 1.70 | 13.711 8.376 1.64 Sqrt::SqrtFixture::(1920x1080, 6, true) 27.011 7.989 3.38 | 27.115 9.245 2.93 ``` ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
656038346b
commit
60de3ff24f
1
3rdparty/hal_rvv/hal_rvv.hpp
vendored
1
3rdparty/hal_rvv/hal_rvv.hpp
vendored
@ -38,6 +38,7 @@
|
||||
#include "hal_rvv_1p0/cholesky.hpp" // core
|
||||
#include "hal_rvv_1p0/qr.hpp" // core
|
||||
#include "hal_rvv_1p0/svd.hpp" // core
|
||||
#include "hal_rvv_1p0/sqrt.hpp" // core
|
||||
|
||||
#include "hal_rvv_1p0/filter.hpp" // imgproc
|
||||
#include "hal_rvv_1p0/pyramids.hpp" // imgproc
|
||||
|
122
3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
vendored
Normal file
122
3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
vendored
Normal file
@ -0,0 +1,122 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level
|
||||
// directory of this distribution and at http://opencv.org/license.html.
|
||||
#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
|
||||
#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
|
||||
|
||||
#include <riscv_vector.h>
|
||||
#include <cmath>
|
||||
#include "hal_rvv_1p0/types.hpp"
|
||||
|
||||
namespace cv { namespace cv_hal_rvv {
|
||||
|
||||
#undef cv_hal_sqrt32f
|
||||
#undef cv_hal_sqrt64f
|
||||
#undef cv_hal_invSqrt32f
|
||||
#undef cv_hal_invSqrt64f
|
||||
|
||||
#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
|
||||
#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
|
||||
|
||||
#ifdef __clang__
|
||||
// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access.
|
||||
// So a smaller LMUL is used here.
|
||||
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M4>>
|
||||
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M4>>
|
||||
#else
|
||||
# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
|
||||
# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
|
||||
#endif
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Newton-Raphson method
|
||||
// Use 4 LMUL registers
|
||||
template <size_t iter_times, typename VEC_T>
|
||||
inline VEC_T sqrt(VEC_T x, size_t vl)
|
||||
{
|
||||
auto x2 = __riscv_vfmul(x, 0.5, vl);
|
||||
auto y = __riscv_vfrsqrt7(x, vl);
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < iter_times; i++)
|
||||
{
|
||||
auto t = __riscv_vfmul(y, y, vl);
|
||||
t = __riscv_vfmul(t, x2, vl);
|
||||
t = __riscv_vfrsub(t, 1.5, vl);
|
||||
y = __riscv_vfmul(t, y, vl);
|
||||
}
|
||||
// just to prevent the compiler from calculating mask before the invSqrt, which will run out
|
||||
// of registers and cause memory access.
|
||||
asm volatile("" ::: "memory");
|
||||
auto mask = __riscv_vmfne(x, 0.0, vl);
|
||||
mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
|
||||
return __riscv_vfmul_mu(mask, x, x, y, vl);
|
||||
}
|
||||
|
||||
// Newton-Raphson method
|
||||
// Use 3 LMUL registers and 1 mask register
|
||||
template <size_t iter_times, typename VEC_T>
|
||||
inline VEC_T invSqrt(VEC_T x, size_t vl)
|
||||
{
|
||||
auto mask = __riscv_vmfne(x, 0.0, vl);
|
||||
mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
|
||||
auto x2 = __riscv_vfmul(x, 0.5, vl);
|
||||
auto y = __riscv_vfrsqrt7(x, vl);
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < iter_times; i++)
|
||||
{
|
||||
auto t = __riscv_vfmul(y, y, vl);
|
||||
t = __riscv_vfmul(t, x2, vl);
|
||||
t = __riscv_vfrsub(t, 1.5, vl);
|
||||
y = __riscv_vfmul_mu(mask, y, t, y, vl);
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename RVV_T>
|
||||
struct Sqrt32f
|
||||
{
|
||||
using T = RVV_T;
|
||||
static constexpr size_t iter_times = 2;
|
||||
};
|
||||
|
||||
template <typename RVV_T>
|
||||
struct Sqrt64f
|
||||
{
|
||||
using T = RVV_T;
|
||||
static constexpr size_t iter_times = 3;
|
||||
};
|
||||
|
||||
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
|
||||
inline int sqrt(const Elem* src, Elem* dst, int _len)
|
||||
{
|
||||
size_t vl;
|
||||
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
|
||||
{
|
||||
vl = SQRT_T::T::setvl(len);
|
||||
auto x = SQRT_T::T::vload(src, vl);
|
||||
SQRT_T::T::vstore(dst, detail::sqrt<SQRT_T::iter_times>(x, vl), vl);
|
||||
}
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
template <typename SQRT_T, typename Elem = typename SQRT_T::T::ElemType>
|
||||
inline int invSqrt(const Elem* src, Elem* dst, int _len)
|
||||
{
|
||||
size_t vl;
|
||||
for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl)
|
||||
{
|
||||
vl = SQRT_T::T::setvl(len);
|
||||
auto x = SQRT_T::T::vload(src, vl);
|
||||
SQRT_T::T::vstore(dst, detail::invSqrt<SQRT_T::iter_times>(x, vl), vl);
|
||||
}
|
||||
|
||||
return CV_HAL_ERROR_OK;
|
||||
}
|
||||
|
||||
}} // namespace cv::cv_hal_rvv
|
||||
|
||||
#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED
|
Loading…
Reference in New Issue
Block a user