From e30697fd42b36960ed0fcf5d2c927f11e6f191bc Mon Sep 17 00:00:00 2001
From: GenshinImpactStarts
 <147074368+GenshinImpactStarts@users.noreply.github.com>
Date: Thu, 13 Mar 2025 13:34:11 +0800
Subject: [PATCH] Merge pull request #27002 from GenshinImpactStarts:magnitude

[HAL RVV] impl magnitude | add perf test #27002

Implement through the existing `cv_hal_magnitude32f` and `cv_hal_magnitude64f` interfaces.

**UPDATE**: UI (Universal Intrinsics) is enabled. The only difference between UI and HAL now is that HAL uses an approximate `sqrt`.

Perf test done on MUSE-PI.

```sh
$ opencv_test_core --gtest_filter="*Magnitude*"
$ opencv_perf_core --gtest_filter="*Magnitude*" --perf_min_samples=300 --perf_force_samples=300
```

Test result between enabled UI and HAL:
```
                 Name of Test                     ui    rvv      rvv
                                                                  vs
                                                                  ui
                                                              (x-factor)
Magnitude::MagnitudeFixture::(127x61, 32FC1)    0.029  0.016     1.75
Magnitude::MagnitudeFixture::(127x61, 64FC1)    0.057  0.036     1.57
Magnitude::MagnitudeFixture::(640x480, 32FC1)   1.063  0.648     1.64
Magnitude::MagnitudeFixture::(640x480, 64FC1)   2.261  1.530     1.48
Magnitude::MagnitudeFixture::(1280x720, 32FC1)  3.261  2.118     1.54
Magnitude::MagnitudeFixture::(1280x720, 64FC1)  6.802  4.682     1.45
Magnitude::MagnitudeFixture::(1920x1080, 32FC1) 7.287  4.738     1.54
Magnitude::MagnitudeFixture::(1920x1080, 64FC1) 15.226 10.334    1.47
```

Test result before and after enabling UI:
```
                 Name of Test                    orig    pr       pr
                                                                  vs
                                                                 orig
                                                              (x-factor)
Magnitude::MagnitudeFixture::(127x61, 32FC1)    0.032  0.029     1.11
Magnitude::MagnitudeFixture::(127x61, 64FC1)    0.067  0.057     1.17
Magnitude::MagnitudeFixture::(640x480, 32FC1)   1.228  1.063     1.16
Magnitude::MagnitudeFixture::(640x480, 64FC1)   2.786  2.261     1.23
Magnitude::MagnitudeFixture::(1280x720, 32FC1)  3.762  3.261     1.15
Magnitude::MagnitudeFixture::(1280x720, 64FC1)  8.549  6.802     1.26
Magnitude::MagnitudeFixture::(1920x1080, 32FC1) 8.408  7.287     1.15
Magnitude::MagnitudeFixture::(1920x1080, 64FC1) 18.884 15.226    1.24
```

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
---
 3rdparty/hal_rvv/hal_rvv.hpp               |  1 +
 3rdparty/hal_rvv/hal_rvv_1p0/magnitude.hpp | 42 ++++++++++++++++++++++
 3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp      | 12 ++++---
 modules/core/perf/perf_math.cpp            | 21 +++++++++++
 modules/core/src/mathfuncs_core.simd.hpp   |  4 +--
 5 files changed, 73 insertions(+), 7 deletions(-)
 create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/magnitude.hpp

diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp
index 57d2ccfee5..83b1ea272c 100644
--- a/3rdparty/hal_rvv/hal_rvv.hpp
+++ b/3rdparty/hal_rvv/hal_rvv.hpp
@@ -30,6 +30,7 @@
 #include "hal_rvv_1p0/minmax.hpp" // core
 #include "hal_rvv_1p0/atan.hpp" // core
 #include "hal_rvv_1p0/split.hpp" // core
+#include "hal_rvv_1p0/magnitude.hpp" // core
 #include "hal_rvv_1p0/flip.hpp" // core
 #include "hal_rvv_1p0/lut.hpp" // core
 #include "hal_rvv_1p0/exp.hpp" // core
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/magnitude.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/magnitude.hpp
new file mode 100644
index 0000000000..eb814c1b77
--- /dev/null
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/magnitude.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences.
+
+#ifndef OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
+#define OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
+
+#include <riscv_vector.h>
+
+#include "hal_rvv_1p0/sqrt.hpp"
+#include "hal_rvv_1p0/types.hpp"
+
+namespace cv { namespace cv_hal_rvv {
+// Point the cv_hal_magnitude32f/64f hooks at the RVV template below (float and double, both using the M8 vector types).
+#undef cv_hal_magnitude32f
+#define cv_hal_magnitude32f cv::cv_hal_rvv::magnitude<cv::cv_hal_rvv::Sqrt32f<cv::cv_hal_rvv::RVV_F32M8>>
+#undef cv_hal_magnitude64f
+#define cv_hal_magnitude64f cv::cv_hal_rvv::magnitude<cv::cv_hal_rvv::Sqrt64f<cv::cv_hal_rvv::RVV_F64M8>>
+// Element-wise magnitude: dst[i] ~= sqrt(x[i]*x[i] + y[i]*y[i]) for len elements (approximate sqrt); always returns CV_HAL_ERROR_OK.
+template <typename SQRT_T, typename T = typename SQRT_T::T::ElemType>
+inline int magnitude(const T* x, const T* y, T* dst, int len)
+{
+    size_t vl;  // assigned at the top of every loop pass, before the loop header reads it
+    for (; len > 0; len -= (int)vl, x += vl, y += vl, dst += vl)
+    {
+        vl = SQRT_T::T::setvl(len);
+        // vl is the number of elements handled in this pass; the loop header advances len and all three pointers by it
+        auto vx = SQRT_T::T::vload(x, vl);
+        auto vy = SQRT_T::T::vload(y, vl);
+        // vfmadd yields vx * vx + (vy * vy); detail::sqrt (from sqrt.hpp) then takes SQRT_T::iter_times as its iteration count
+        auto vmag = detail::sqrt<SQRT_T::iter_times>(__riscv_vfmadd(vx, vx, __riscv_vfmul(vy, vy, vl), vl), vl);
+        SQRT_T::T::vstore(dst, vmag, vl);
+    }
+
+    return CV_HAL_ERROR_OK;
+}
+
+}}  // namespace cv::cv_hal_rvv
+
+#endif  // OPENCV_HAL_RVV_MAGNITUDE_HPP_INCLUDED
diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
index 9a2e5d6bfe..9ed72f6818 100644
--- a/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
+++ b/3rdparty/hal_rvv/hal_rvv_1p0/sqrt.hpp
@@ -45,11 +45,12 @@ inline VEC_T sqrt(VEC_T x, size_t vl)
         t = __riscv_vfrsub(t, 1.5, vl);
         y = __riscv_vfmul(t, y, vl);
     }
-    // just to prevent the compiler from calculating mask before the invSqrt, which will run out
+    // just to prevent the compiler from calculating mask before the iteration, which will run out
     // of registers and cause memory access.
     asm volatile("" ::: "memory");
-    auto mask = __riscv_vmfne(x, 0.0, vl);
-    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    auto classified = __riscv_vfclass(x, vl);
+    // block -0, +0, positive subnormal number, +inf
+    auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
     return __riscv_vfmul_mu(mask, x, x, y, vl);
 }
 
@@ -58,8 +59,9 @@ inline VEC_T sqrt(VEC_T x, size_t vl)
 template <size_t iter_times, typename VEC_T>
 inline VEC_T invSqrt(VEC_T x, size_t vl)
 {
-    auto mask = __riscv_vmfne(x, 0.0, vl);
-    mask = __riscv_vmfne_mu(mask, mask, x, INFINITY, vl);
+    auto classified = __riscv_vfclass(x, vl);
+    // block -0, +0, positive subnormal number, +inf
+    auto mask = __riscv_vmseq(__riscv_vand(classified, 0b10111000, vl), 0, vl);
     auto x2 = __riscv_vfmul(x, 0.5, vl);
     auto y = __riscv_vfrsqrt7(x, vl);
 #pragma unroll
diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp
index fe947aec1a..c06fda44da 100644
--- a/modules/core/perf/perf_math.cpp
+++ b/modules/core/perf/perf_math.cpp
@@ -36,6 +36,27 @@ PERF_TEST_P(VectorLength, phase64f, testing::Values(128, 1000, 128*1024, 512*102
     SANITY_CHECK(angle, 5e-5);
 }
 
+///////////// Magnitude /////////////
+
+typedef Size_MatType MagnitudeFixture;
+
+PERF_TEST_P(MagnitudeFixture, Magnitude,
+    testing::Combine(testing::Values(TYPICAL_MAT_SIZES), testing::Values(CV_32F, CV_64F)))
+{
+    cv::Size size = std::get<0>(GetParam());
+    int type = std::get<1>(GetParam());
+    // Performance test for cv::magnitude: two inputs and one output, all with the parameterized size and type.
+    cv::Mat x(size, type);
+    cv::Mat y(size, type);
+    cv::Mat magnitude(size, type);
+    // x and y are declared as inputs with WARMUP_RNG (presumably random-filled by the perf framework); magnitude is the output.
+    declare.in(x, y, WARMUP_RNG).out(magnitude);
+    // The cv::magnitude call is the statement placed under TEST_CYCLE().
+    TEST_CYCLE() cv::magnitude(x, y, magnitude);
+    // No result values are compared against a reference in this test.
+    SANITY_CHECK_NOTHING();
+}
+
 // generates random vectors, performs Gram-Schmidt orthogonalization on them
 Mat randomOrtho(int rows, int ftype, RNG& rng)
 {
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index 41a3261c64..0d9d9272e6 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -273,7 +273,7 @@ void magnitude32f(const float* x, const float* y, float* mag, int len)
 
     int i = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
@@ -306,7 +306,7 @@ void magnitude64f(const double* x, const double* y, double* mag, int len)
 
     int i = 0;
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {