diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index a4dd6c2326..57d2ccfee5 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -38,6 +38,7 @@ #include "hal_rvv_1p0/cholesky.hpp" // core #include "hal_rvv_1p0/qr.hpp" // core #include "hal_rvv_1p0/svd.hpp" // core +#include "hal_rvv_1p0/sqrt.hpp" // core #include "hal_rvv_1p0/filter.hpp" // imgproc #include "hal_rvv_1p0/pyramids.hpp" // imgproc diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp new file mode 100644 index 0000000000..9a2e5d6bfe --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp @@ -0,0 +1,122 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. +#ifndef OPENCV_HAL_RVV_SQRT_HPP_INCLUDED +#define OPENCV_HAL_RVV_SQRT_HPP_INCLUDED + +#include +#include +#include "hal_rvv_1p0/types.hpp" + +namespace cv { namespace cv_hal_rvv { + +#undef cv_hal_sqrt32f +#undef cv_hal_sqrt64f +#undef cv_hal_invSqrt32f +#undef cv_hal_invSqrt64f + +#define cv_hal_sqrt32f cv::cv_hal_rvv::sqrt> +#define cv_hal_sqrt64f cv::cv_hal_rvv::sqrt> + +#ifdef __clang__ +// Strange bug in clang: invSqrt use 2 LMUL registers to store mask, which will cause memory access. +// So a smaller LMUL is used here. +# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> +# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> +#else +# define cv_hal_invSqrt32f cv::cv_hal_rvv::invSqrt> +# define cv_hal_invSqrt64f cv::cv_hal_rvv::invSqrt> +#endif + +namespace detail { + +// Newton-Raphson method +// Use 4 LMUL registers +template +inline VEC_T sqrt(VEC_T x, size_t vl) +{ + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#pragma unroll + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul(t, y, vl); + } + // just to prevent the compiler from calculating mask before the invSqrt, which will run out + // of registers and cause memory access. + asm volatile("" ::: "memory"); + auto mask = __riscv_vmfne(x, 0.0, vl); + mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl); + return __riscv_vfmul_mu(mask, x, x, y, vl); +} + +// Newton-Raphson method +// Use 3 LMUL registers and 1 mask register +template +inline VEC_T invSqrt(VEC_T x, size_t vl) +{ + auto mask = __riscv_vmfne(x, 0.0, vl); + mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl); + auto x2 = __riscv_vfmul(x, 0.5, vl); + auto y = __riscv_vfrsqrt7(x, vl); +#pragma unroll + for (size_t i = 0; i < iter_times; i++) + { + auto t = __riscv_vfmul(y, y, vl); + t = __riscv_vfmul(t, x2, vl); + t = __riscv_vfrsub(t, 1.5, vl); + y = __riscv_vfmul_mu(mask, y, t, y, vl); + } + return y; +} + +} // namespace detail + +template +struct Sqrt32f +{ + using T = RVV_T; + static constexpr size_t iter_times = 2; +}; + +template +struct Sqrt64f +{ + using T = RVV_T; + static constexpr size_t iter_times = 3; +}; + +template +inline int sqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, detail::sqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +template +inline int invSqrt(const Elem* src, Elem* dst, int _len) +{ + size_t vl; + for (size_t len = _len; len > 0; len -= vl, src += vl, dst += vl) + { + vl = SQRT_T::T::setvl(len); + auto x = SQRT_T::T::vload(src, vl); + SQRT_T::T::vstore(dst, detail::invSqrt(x, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +}} // namespace cv::cv_hal_rvv + +#endif // OPENCV_HAL_RVV_SQRT_HPP_INCLUDED