From b5f5540e8ab5f94afc43e30b77f42afb90635fe4 Mon Sep 17 00:00:00 2001
From: Daniil Anufriev <danufriev2004@gmail.com>
Date: Fri, 21 Feb 2025 17:36:54 +0300
Subject: [PATCH] Merge pull request #26886 from sk1er52:feature/exp64f
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable SIMD_SCALABLE for exp and sqrt #26886

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
```
CPU - Banana Pi k1, compiler - clang 18.1.4
```
```
Geometric mean (ms)

              Name of Test               baseline  hal     ui      hal         ui
                                                                    vs         vs
                                                                 baseline   baseline
                                                                (x-factor) (x-factor)
Exp::ExpFixture::(127x61, 32FC1)          0.358     --   0.033      --       10.70
Exp::ExpFixture::(640x480, 32FC1)         14.304    --   1.167      --       12.26
Exp::ExpFixture::(1280x720, 32FC1)        42.785    --   3.538      --       12.09
Exp::ExpFixture::(1920x1080, 32FC1)       96.206    --   7.927      --       12.14
Exp::ExpFixture::(127x61, 64FC1)          0.433   0.050  0.098     8.59       4.40
Exp::ExpFixture::(640x480, 64FC1)         17.315  1.935  3.813     8.95       4.54
Exp::ExpFixture::(1280x720, 64FC1)        52.181  5.877  11.519    8.88       4.53
Exp::ExpFixture::(1920x1080, 64FC1)      117.082  13.157 25.854    8.90       4.53
```
Additionally, this PR enables the Sqrt optimization with Universal Intrinsics (UI):
```
Geometric mean (ms)

              Name of Test                     baseline    ui       ui
                                                                    vs
                                                                 baseline
                                                                (x-factor)
Sqrt::SqrtFixture::(127x61, 5, false)            0.111   0.027     4.11
Sqrt::SqrtFixture::(127x61, 6, false)            0.149   0.053     2.82
Sqrt::SqrtFixture::(640x480, 5, false)           4.374   0.967     4.52
Sqrt::SqrtFixture::(640x480, 6, false)           5.885   2.046     2.88
Sqrt::SqrtFixture::(1280x720, 5, false)          12.960  2.915     4.45
Sqrt::SqrtFixture::(1280x720, 6, false)          17.648  6.107     2.89
Sqrt::SqrtFixture::(1920x1080, 5, false)         29.178  6.524     4.47
Sqrt::SqrtFixture::(1920x1080, 6, false)         39.709  13.670    2.90
```

Reference
Muller, J.-M. Elementary Functions: Algorithms and Implementation. 2nd ed. Boston: Birkhäuser, 2006.
https://www.springer.com/gp/book/9780817643720
---
 modules/core/perf/perf_arithm.cpp        | 42 ++++++++++++++++++++----
 modules/core/src/mathfuncs_core.simd.hpp |  8 ++---
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/modules/core/perf/perf_arithm.cpp b/modules/core/perf/perf_arithm.cpp
index 6cc25a3476..b9decbfd4e 100644
--- a/modules/core/perf/perf_arithm.cpp
+++ b/modules/core/perf/perf_arithm.cpp
@@ -706,22 +706,27 @@ INSTANTIATE_TEST_CASE_P(/*nothing*/ , ArithmMixedTest,
     )
 );
 
-typedef Size_MatType InvSqrtFixture;
-PERF_TEST_P(InvSqrtFixture, InvSqrt, testing::Combine(
-    testing::Values(TYPICAL_MAT_SIZES),
-    testing::Values(CV_32FC1, CV_64FC1)))
-{
+typedef perf::TestBaseWithParam<std::tuple<cv::Size, int, bool>> SqrtFixture;
+PERF_TEST_P_(SqrtFixture, Sqrt) {
     Size sz = get<0>(GetParam());
     int type = get<1>(GetParam());
+    bool inverse = get<2>(GetParam());
 
     Mat src(sz, type), dst(sz, type);
     randu(src, FLT_EPSILON, 1000);
     declare.in(src).out(dst);
 
-    TEST_CYCLE() cv::pow(src, -0.5, dst);
+    TEST_CYCLE() cv::pow(src, inverse ? -0.5 : 0.5, dst);
 
     SANITY_CHECK_NOTHING();
 }
+INSTANTIATE_TEST_CASE_P(/*nothing*/ , SqrtFixture,
+    testing::Combine(
+        testing::Values(TYPICAL_MAT_SIZES),
+        testing::Values(CV_32FC1, CV_64FC1),
+        testing::Bool()
+    )
+);
 
 ///////////// Rotate ////////////////////////
 
@@ -815,4 +820,29 @@ INSTANTIATE_TEST_CASE_P(/*nothing*/ , PatchNaNsFixture,
     )
 );
 
+///////////// Exp ////////////////////////
+
+typedef Size_MatType ExpFixture;
+
+PERF_TEST_P(ExpFixture, Exp,
+    testing::Combine(testing::Values(TYPICAL_MAT_SIZES), testing::Values(CV_32F, CV_64F)))
+{
+    cv::Size size = std::get<0>(GetParam());
+    int type = std::get<1>(GetParam());
+
+    cv::Mat src(size, type);
+    cv::Mat dst(size, type);
+
+    declare.in(src).out(dst);
+
+    cv::randu(src, -5.0, 5.0);
+
+    TEST_CYCLE()
+    {
+        cv::exp(src, dst);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp
index 3fa3cba1b8..f92234f140 100644
--- a/modules/core/src/mathfuncs_core.simd.hpp
+++ b/modules/core/src/mathfuncs_core.simd.hpp
@@ -396,7 +396,7 @@ void sqrt32f(const float* src, float* dst, int len)
 
     int i = 0;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
@@ -425,7 +425,7 @@ void sqrt64f(const double* src, double* dst, int len)
 
     int i = 0;
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     for( ; i < len; i += VECSZ*2 )
     {
@@ -527,7 +527,7 @@ void exp32f( const float *_x, float *y, int n )
     float maxval = (float)(exp_max_val/exp_prescale);
     float postscale = (float)exp_postscale;
 
-#if CV_SIMD
+#if (CV_SIMD || CV_SIMD_SCALABLE)
     const int VECSZ = VTraits<v_float32>::vlanes();
     const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
     const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
@@ -641,7 +641,7 @@ void exp64f( const double *_x, double *y, int n )
     double minval = (-exp_max_val/exp_prescale);
     double maxval = (exp_max_val/exp_prescale);
 
-#if CV_SIMD_64F
+#if (CV_SIMD_64F || CV_SIMD_SCALABLE_64F)
     const int VECSZ = VTraits<v_float64>::vlanes();
     const v_float64 vprescale = vx_setall_f64(exp_prescale);
     const v_float64 vpostscale = vx_setall_f64(exp_postscale);