From 2090407002b4f91186d758f150e839d2c1a56677 Mon Sep 17 00:00:00 2001 From: GenshinImpactStarts <147074368+GenshinImpactStarts@users.noreply.github.com> Date: Mon, 17 Mar 2025 19:16:09 +0800 Subject: [PATCH] Merge pull request #26999 from GenshinImpactStarts:polar_to_cart [HAL RVV] unify and impl polar_to_cart | add perf test #26999 ### Summary 1. Implement through the existing `cv_hal_polarToCart32f` and `cv_hal_polarToCart64f` interfaces. 2. Add `polarToCart` performance tests 3. Make `cv::polarToCart` use CALL_HAL in the same way as `cv::cartToPolar` 4. To achieve the 3rd point, the original implementation was moved, and some modifications were made. Tested through: ```sh opencv_test_core --gtest_filter="*PolarToCart*:*Core_CartPolar_reverse*" opencv_perf_core --gtest_filter="*PolarToCart*" --perf_min_samples=300 --perf_force_samples=300 ``` ### HAL performance test ***UPDATE***: Current implementation is no more depending on vlen. **NOTE**: Due to the 4th point in the summary above, the `scalar` and `ui` test is based on the modified code of this PR. The impact of this patch on `scalar` and `ui` is evaluated in the next section, `Effect of Point 4`. Vlen 256 (Muse Pi): ``` Name of Test scalar ui rvv ui rvv vs vs scalar scalar (x-factor) (x-factor) PolarToCart::PolarToCartFixture::(127x61, 32FC1) 0.315 0.110 0.034 2.85 9.34 PolarToCart::PolarToCartFixture::(127x61, 64FC1) 0.423 0.163 0.045 2.59 9.34 PolarToCart::PolarToCartFixture::(640x480, 32FC1) 13.695 4.325 1.278 3.17 10.71 PolarToCart::PolarToCartFixture::(640x480, 64FC1) 17.719 7.118 2.105 2.49 8.42 PolarToCart::PolarToCartFixture::(1280x720, 32FC1) 40.678 13.114 3.977 3.10 10.23 PolarToCart::PolarToCartFixture::(1280x720, 64FC1) 53.124 21.298 6.519 2.49 8.15 PolarToCart::PolarToCartFixture::(1920x1080, 32FC1) 95.158 29.465 8.894 3.23 10.70 PolarToCart::PolarToCartFixture::(1920x1080, 64FC1) 119.262 47.743 14.129 2.50 8.44 ``` ### Effect of Point 4 To make `cv::polarToCart` behave the same as `cv::cartToPolar`, the implementation detail of the former has been moved to the latter's location (from `mathfuncs.cpp` to `mathfuncs_core.simd.hpp`). #### Reason for Changes: This function works as follows: $y = \text{mag} \times \sin(\text{angle})$ and $x = \text{mag} \times \cos(\text{angle})$. The original implementation first calculates the values of $\sin$ and $\cos$, storing the results in the output buffers $x$ and $y$, and then multiplies the result by $\text{mag}$. However, when the function is used as an in-place operation (one of the output buffers is also an input buffer), the original implementation allocates an extra buffer to store the $\sin$ and $\cos$ values in case the $\text{mag}$ value gets overwritten. This extra buffer allocation prevents `cv::polarToCart` from functioning in the same way as `cv::cartToPolar`. Therefore, the multiplication is now performed immediately without storing intermediate values. Since the original implementation also had AVX2 optimizations, I have applied the same optimizations to the AVX2 version of this implementation. ***UPDATE***: UI use v_sincos from #25892 now. The original implementation has AVX2 optimizations but is slower much than current UI so it's removed, and AVX2 perf test is below. Scalar implementation isn't changed because it's faster than using UI's method. #### Test Result `scalar` and `ui` test is done on Muse PI, and AVX2 test is done on Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz. `scalar` test: ``` Name of Test orig pr pr vs orig (x-factor) PolarToCart::PolarToCartFixture::(127x61, 32FC1) 0.333 0.294 1.13 PolarToCart::PolarToCartFixture::(127x61, 64FC1) 0.385 0.403 0.96 PolarToCart::PolarToCartFixture::(640x480, 32FC1) 14.749 12.343 1.19 PolarToCart::PolarToCartFixture::(640x480, 64FC1) 19.419 16.743 1.16 PolarToCart::PolarToCartFixture::(1280x720, 32FC1) 44.155 37.822 1.17 PolarToCart::PolarToCartFixture::(1280x720, 64FC1) 62.108 50.358 1.23 PolarToCart::PolarToCartFixture::(1920x1080, 32FC1) 99.011 85.769 1.15 PolarToCart::PolarToCartFixture::(1920x1080, 64FC1) 127.740 112.874 1.13 ``` `ui` test: ``` Name of Test orig pr pr vs orig (x-factor) PolarToCart::PolarToCartFixture::(127x61, 32FC1) 0.306 0.110 2.77 PolarToCart::PolarToCartFixture::(127x61, 64FC1) 0.455 0.163 2.79 PolarToCart::PolarToCartFixture::(640x480, 32FC1) 13.381 4.325 3.09 PolarToCart::PolarToCartFixture::(640x480, 64FC1) 21.851 7.118 3.07 PolarToCart::PolarToCartFixture::(1280x720, 32FC1) 39.975 13.114 3.05 PolarToCart::PolarToCartFixture::(1280x720, 64FC1) 67.006 21.298 3.15 PolarToCart::PolarToCartFixture::(1920x1080, 32FC1) 90.362 29.465 3.07 PolarToCart::PolarToCartFixture::(1920x1080, 64FC1) 129.637 47.743 2.72 ``` AVX2 test: ``` Name of Test orig pr pr vs orig (x-factor) PolarToCart::PolarToCartFixture::(127x61, 32FC1) 0.019 0.009 2.11 PolarToCart::PolarToCartFixture::(127x61, 64FC1) 0.022 0.013 1.74 PolarToCart::PolarToCartFixture::(640x480, 32FC1) 0.788 0.355 2.22 PolarToCart::PolarToCartFixture::(640x480, 64FC1) 1.102 0.618 1.78 PolarToCart::PolarToCartFixture::(1280x720, 32FC1) 2.383 1.042 2.29 PolarToCart::PolarToCartFixture::(1280x720, 64FC1) 3.758 2.316 1.62 PolarToCart::PolarToCartFixture::(1920x1080, 32FC1) 5.577 2.559 2.18 PolarToCart::PolarToCartFixture::(1920x1080, 64FC1) 9.710 6.424 1.51 ``` A slight performance loss occurs because the check for whether $mag$ is nullptr is performed with every calculation, instead of being done once per batch. This is to reuse current `SinCos_32f` function. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake --- 3rdparty/hal_rvv/hal_rvv.hpp | 1 + .../hal_rvv/hal_rvv_1p0/polar_to_cart.hpp | 53 +++++ 3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp | 4 +- 3rdparty/hal_rvv/hal_rvv_1p0/sincos.hpp | 72 ++++++ 3rdparty/hal_rvv/hal_rvv_1p0/types.hpp | 78 +++++- modules/core/perf/perf_math.cpp | 22 ++ modules/core/src/mathfuncs.cpp | 223 +----------------- modules/core/src/mathfuncs_core.dispatch.cpp | 20 ++ modules/core/src/mathfuncs_core.simd.hpp | 163 +++++++++++++ 9 files changed, 411 insertions(+), 225 deletions(-) create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/polar_to_cart.hpp create mode 100644 3rdparty/hal_rvv/hal_rvv_1p0/sincos.hpp diff --git a/3rdparty/hal_rvv/hal_rvv.hpp b/3rdparty/hal_rvv/hal_rvv.hpp index 92eea59aae..45c7da446d 100644 --- a/3rdparty/hal_rvv/hal_rvv.hpp +++ b/3rdparty/hal_rvv/hal_rvv.hpp @@ -33,6 +33,7 @@ #include "hal_rvv_1p0/split.hpp" // core #include "hal_rvv_1p0/magnitude.hpp" // core #include "hal_rvv_1p0/cart_to_polar.hpp" // core +#include "hal_rvv_1p0/polar_to_cart.hpp" // core #include "hal_rvv_1p0/flip.hpp" // core #include "hal_rvv_1p0/lut.hpp" // core #include "hal_rvv_1p0/exp.hpp" // core diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/polar_to_cart.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/polar_to_cart.hpp new file mode 100644 index 0000000000..feab2047e5 --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/polar_to_cart.hpp @@ -0,0 +1,53 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#ifndef OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED +#define OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED + +#include +#include "hal_rvv_1p0/sincos.hpp" +#include "hal_rvv_1p0/types.hpp" + +namespace cv { namespace cv_hal_rvv { + +#undef cv_hal_polarToCart32f +#define cv_hal_polarToCart32f cv::cv_hal_rvv::polarToCart +#undef cv_hal_polarToCart64f +#define cv_hal_polarToCart64f cv::cv_hal_rvv::polarToCart + +template +inline int + polarToCart(const Elem* mag, const Elem* angle, Elem* x, Elem* y, int len, bool angleInDegrees) +{ + using T = RVV_F32M4; + const auto sincos_scale = angleInDegrees ? detail::sincos_deg_scale : detail::sincos_rad_scale; + + size_t vl; + auto cos_p2 = T::vmv(detail::sincos_cos_p2, T::setvlmax()); + auto cos_p0 = T::vmv(detail::sincos_cos_p0, T::setvlmax()); + for (; len > 0; len -= (int)vl, angle += vl, x += vl, y += vl) + { + vl = RVV_T::setvl(len); + auto vangle = T::cast(RVV_T::vload(angle, vl), vl); + T::VecType vsin, vcos; + detail::SinCos32f(vangle, vsin, vcos, sincos_scale, cos_p2, cos_p0, vl); + if (mag) + { + auto vmag = T::cast(RVV_T::vload(mag, vl), vl); + vsin = __riscv_vfmul(vsin, vmag, vl); + vcos = __riscv_vfmul(vcos, vmag, vl); + mag += vl; + } + RVV_T::vstore(x, RVV_T::cast(vcos, vl), vl); + RVV_T::vstore(y, RVV_T::cast(vsin, vl), vl); + } + + return CV_HAL_ERROR_OK; +} + +}} // namespace cv::cv_hal_rvv + +#endif // OPENCV_HAL_RVV_POLAR_TO_CART_HPP_INCLUDED diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp index 1b07cabfb2..a349d341c5 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/pyramids.hpp @@ -25,8 +25,8 @@ template<> struct rvv using WT = RVV_SameLen; using MT = RVV_SameLen; - static inline WT::VecType vcvt_T_WT(T::VecType a, size_t b) { return WT::cast(MT::cast(a, b), b); } - static inline T::VecType vcvt_WT_T(WT::VecType a, int b, size_t c) { return T::cast(MT::cast(__riscv_vsra(__riscv_vadd(a, 1 << (b - 1), c), b, c), c), c); } + static inline WT::VecType vcvt_T_WT(T::VecType a, size_t b) { return WT::reinterpret(MT::cast(a, b)); } + static inline T::VecType vcvt_WT_T(WT::VecType a, int b, size_t c) { return T::cast(MT::reinterpret(__riscv_vsra(__riscv_vadd(a, 1 << (b - 1), c), b, c)), c); } static inline WT::VecType down0(WT::VecType vec_src0, WT::VecType vec_src1, WT::VecType vec_src2, WT::VecType vec_src3, WT::VecType vec_src4, size_t vl) { return __riscv_vadd(__riscv_vadd(__riscv_vadd(vec_src0, vec_src4, vl), __riscv_vadd(vec_src2, vec_src2, vl), vl), __riscv_vsll(__riscv_vadd(__riscv_vadd(vec_src1, vec_src2, vl), vec_src3, vl), 2, vl), vl); diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/sincos.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/sincos.hpp new file mode 100644 index 0000000000..776d58f42c --- /dev/null +++ b/3rdparty/hal_rvv/hal_rvv_1p0/sincos.hpp @@ -0,0 +1,72 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2025, Institute of Software, Chinese Academy of Sciences. + +#ifndef OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED +#define OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED + +#include +#include "hal_rvv_1p0/types.hpp" + +namespace cv { namespace cv_hal_rvv { namespace detail { + +static constexpr size_t sincos_mask = 0x3; + +static constexpr float sincos_rad_scale = 2.f / CV_PI; +static constexpr float sincos_deg_scale = 2.f / 180.f; + +// Taylor expansion coefficients for sin(x*pi/2) and cos(x*pi/2) +static constexpr double sincos_sin_p7 = -0.004681754135319; +static constexpr double sincos_sin_p5 = 0.079692626246167; +static constexpr double sincos_sin_p3 = -0.645964097506246; +static constexpr double sincos_sin_p1 = 1.570796326794897; + +static constexpr double sincos_cos_p8 = 0.000919260274839; +static constexpr double sincos_cos_p6 = -0.020863480763353; +static constexpr double sincos_cos_p4 = 0.253669507901048; +static constexpr double sincos_cos_p2 = -1.233700550136170; +static constexpr double sincos_cos_p0 = 1.000000000000000; + +// Taylor expansion and angle sum identity +// Use 7 LMUL registers (can be reduced to 5 by splitting fmadd to fadd and fmul) +template +static inline void + SinCos32f(T angle, T& sinval, T& cosval, float scale, T cos_p2, T cos_p0, size_t vl) +{ + angle = __riscv_vfmul(angle, scale, vl); + auto round_angle = RVV_ToInt::cast(angle, vl); + auto delta_angle = __riscv_vfsub(angle, RVV_T::cast(round_angle, vl), vl); + auto delta_angle2 = __riscv_vfmul(delta_angle, delta_angle, vl); + + auto sin = __riscv_vfadd(__riscv_vfmul(delta_angle2, sincos_sin_p7, vl), sincos_sin_p5, vl); + sin = __riscv_vfadd(__riscv_vfmul(delta_angle2, sin, vl), sincos_sin_p3, vl); + sin = __riscv_vfadd(__riscv_vfmul(delta_angle2, sin, vl), sincos_sin_p1, vl); + sin = __riscv_vfmul(delta_angle, sin, vl); + + auto cos = __riscv_vfadd(__riscv_vfmul(delta_angle2, sincos_cos_p8, vl), sincos_cos_p6, vl); + cos = __riscv_vfadd(__riscv_vfmul(delta_angle2, cos, vl), sincos_cos_p4, vl); + cos = __riscv_vfmadd(cos, delta_angle2, cos_p2, vl); + cos = __riscv_vfmadd(cos, delta_angle2, cos_p0, vl); + + // idx = 0: sinval = sin, cosval = cos + // idx = 1: sinval = cos, cosval = -sin + // idx = 2: sinval = -sin, cosval = -cos + // idx = 3: sinval = -cos, cosval = sin + auto idx = __riscv_vand(round_angle, sincos_mask, vl); + auto idx1 = __riscv_vmseq(idx, 1, vl); + auto idx2 = __riscv_vmseq(idx, 2, vl); + auto idx3 = __riscv_vmseq(idx, 3, vl); + + auto idx13 = __riscv_vmor(idx1, idx3, vl); + sinval = __riscv_vmerge(sin, cos, idx13, vl); + cosval = __riscv_vmerge(cos, sin, idx13, vl); + + sinval = __riscv_vfneg_mu(__riscv_vmor(idx2, idx3, vl), sinval, sinval, vl); + cosval = __riscv_vfneg_mu(__riscv_vmor(idx1, idx2, vl), cosval, cosval, vl); +} + +}}} // namespace cv::cv_hal_rvv::detail + +#endif // OPENCV_HAL_RVV_SINCOS_HPP_INCLUDED diff --git a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp index e6d4bd99fb..71182e0e4f 100644 --- a/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp +++ b/3rdparty/hal_rvv/hal_rvv_1p0/types.hpp @@ -96,6 +96,22 @@ template using RVV_SameLen = RVV; +template struct RVV_ToIntHelper; +template struct RVV_ToUintHelper; +template struct RVV_ToFloatHelper; + +template +using RVV_ToInt = + RVV::type, RVV_T::lmul>; + +template +using RVV_ToUint = + RVV::type, RVV_T::lmul>; + +template +using RVV_ToFloat = + RVV::type, RVV_T::lmul>; + template using RVV_BaseType = RVV; @@ -203,13 +219,15 @@ static inline BaseType vredmax(VecType vs2, BaseType vs1, size_t vl) { using BoolType = HAL_RVV_BOOL_TYPE(__VA_ARGS__); \ using BaseType = v##VEC_TYPE##m1_t; \ \ - static constexpr size_t lmul = LMUL_TYPE; \ + static constexpr RVV_LMUL lmul = LMUL_TYPE; \ \ HAL_RVV_SIZE_RELATED(EEW, TYPE, LMUL, __VA_ARGS__) \ HAL_RVV_SIZE_UNRELATED(__VA_ARGS__) \ \ template \ inline static VecType cast(FROM v, size_t vl); \ + template \ + inline static VecType reinterpret(FROM v); \ }; \ \ template <> \ @@ -304,6 +322,20 @@ HAL_RVV_DEFINE_ONE( float, float32, LMUL_f2, 32, f32, mf2, HAL_RVV_FLOAT_PARAM) // -------------------------------Define cast-------------------------------- +template <> struct RVV_ToIntHelper<1> {using type = int8_t;}; +template <> struct RVV_ToIntHelper<2> {using type = int16_t;}; +template <> struct RVV_ToIntHelper<4> {using type = int32_t;}; +template <> struct RVV_ToIntHelper<8> {using type = int64_t;}; + +template <> struct RVV_ToUintHelper<1> {using type = uint8_t;}; +template <> struct RVV_ToUintHelper<2> {using type = uint16_t;}; +template <> struct RVV_ToUintHelper<4> {using type = uint32_t;}; +template <> struct RVV_ToUintHelper<8> {using type = uint64_t;}; + +template <> struct RVV_ToFloatHelper<2> {using type = _Float16;}; +template <> struct RVV_ToFloatHelper<4> {using type = float;}; +template <> struct RVV_ToFloatHelper<8> {using type = double;}; + #define HAL_RVV_CVT(ONE, TWO) \ template <> \ inline ONE::VecType ONE::cast(TWO::VecType v, size_t vl) { return __riscv_vncvt_x(v, vl); } \ @@ -441,18 +473,52 @@ HAL_RVV_CVT(RVV_F32MF2, RVV_F64M1) #undef HAL_RVV_CVT -#define HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_TYPE, LMUL) \ +#define HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_TYPE, LMUL, IS_U) \ template <> \ inline RVV::VecType RVV::cast( \ - RVV::VecType v, [[maybe_unused]] size_t vl \ + RVV::VecType v, size_t vl \ ) { \ - return __riscv_vreinterpret_##A_TYPE##LMUL(v); \ + return __riscv_vfcvt_f_x##IS_U##_v_##A_TYPE##LMUL(v, vl); \ } \ template <> \ inline RVV::VecType RVV::cast( \ - RVV::VecType v, [[maybe_unused]] size_t vl \ + RVV::VecType v, size_t vl \ ) { \ - return __riscv_vreinterpret_##B_TYPE##LMUL(v); \ + return __riscv_vfcvt_x##IS_U##_f_v_##B_TYPE##LMUL(v, vl); \ + } + +HAL_RVV_CVT( float, int32_t, f32, i32, LMUL_1, m1, ) +HAL_RVV_CVT( float, int32_t, f32, i32, LMUL_2, m2, ) +HAL_RVV_CVT( float, int32_t, f32, i32, LMUL_4, m4, ) +HAL_RVV_CVT( float, int32_t, f32, i32, LMUL_8, m8, ) +HAL_RVV_CVT( float, int32_t, f32, i32, LMUL_f2, mf2, ) + +HAL_RVV_CVT( float, uint32_t, f32, u32, LMUL_1, m1, u) +HAL_RVV_CVT( float, uint32_t, f32, u32, LMUL_2, m2, u) +HAL_RVV_CVT( float, uint32_t, f32, u32, LMUL_4, m4, u) +HAL_RVV_CVT( float, uint32_t, f32, u32, LMUL_8, m8, u) +HAL_RVV_CVT( float, uint32_t, f32, u32, LMUL_f2, mf2, u) + +HAL_RVV_CVT(double, int64_t, f64, i64, LMUL_1, m1, ) +HAL_RVV_CVT(double, int64_t, f64, i64, LMUL_2, m2, ) +HAL_RVV_CVT(double, int64_t, f64, i64, LMUL_4, m4, ) +HAL_RVV_CVT(double, int64_t, f64, i64, LMUL_8, m8, ) + +HAL_RVV_CVT(double, uint64_t, f64, u64, LMUL_1, m1, u) +HAL_RVV_CVT(double, uint64_t, f64, u64, LMUL_2, m2, u) +HAL_RVV_CVT(double, uint64_t, f64, u64, LMUL_4, m4, u) +HAL_RVV_CVT(double, uint64_t, f64, u64, LMUL_8, m8, u) + +#undef HAL_RVV_CVT + +#define HAL_RVV_CVT(A, B, A_TYPE, B_TYPE, LMUL_TYPE, LMUL) \ + template <> \ + inline RVV::VecType RVV::reinterpret(RVV::VecType v) { \ + return __riscv_vreinterpret_##A_TYPE##LMUL(v); \ + } \ + template <> \ + inline RVV::VecType RVV::reinterpret(RVV::VecType v) { \ + return __riscv_vreinterpret_##B_TYPE##LMUL(v); \ } #define HAL_RVV_CVT2(A, B, A_TYPE, B_TYPE) \ diff --git a/modules/core/perf/perf_math.cpp b/modules/core/perf/perf_math.cpp index 398a3ad651..e06e281592 100644 --- a/modules/core/perf/perf_math.cpp +++ b/modules/core/perf/perf_math.cpp @@ -79,6 +79,28 @@ PERF_TEST_P(CartToPolarFixture, CartToPolar, SANITY_CHECK_NOTHING(); } +///////////// Polar to Cart ///////////// + +typedef Size_MatType PolarToCartFixture; + +PERF_TEST_P(PolarToCartFixture, PolarToCart, + testing::Combine(testing::Values(TYPICAL_MAT_SIZES), testing::Values(CV_32F, CV_64F))) +{ + cv::Size size = std::get<0>(GetParam()); + int type = std::get<1>(GetParam()); + + cv::Mat magnitude(size, type); + cv::Mat angle(size, type); + cv::Mat x(size, type); + cv::Mat y(size, type); + + declare.in(magnitude, angle, WARMUP_RNG).out(x, y); + + TEST_CYCLE() cv::polarToCart(magnitude, angle, x, y); + + SANITY_CHECK_NOTHING(); +} + // generates random vectors, performs Gram-Schmidt orthogonalization on them Mat randomOrtho(int rows, int ftype, RNG& rng) { diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 2ba2a4001e..0403b75452 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -328,149 +328,6 @@ void cartToPolar( InputArray src1, InputArray src2, * Polar -> Cartezian * \****************************************************************************************/ -static void SinCos_32f( const float *angle, float *sinval, float* cosval, - int len, int angle_in_degrees ) -{ - const int N = 64; - - static const double sin_table[] = - { - 0.00000000000000000000, 0.09801714032956060400, - 0.19509032201612825000, 0.29028467725446233000, - 0.38268343236508978000, 0.47139673682599764000, - 0.55557023301960218000, 0.63439328416364549000, - 0.70710678118654746000, 0.77301045336273699000, - 0.83146961230254524000, 0.88192126434835494000, - 0.92387953251128674000, 0.95694033573220894000, - 0.98078528040323043000, 0.99518472667219682000, - 1.00000000000000000000, 0.99518472667219693000, - 0.98078528040323043000, 0.95694033573220894000, - 0.92387953251128674000, 0.88192126434835505000, - 0.83146961230254546000, 0.77301045336273710000, - 0.70710678118654757000, 0.63439328416364549000, - 0.55557023301960218000, 0.47139673682599786000, - 0.38268343236508989000, 0.29028467725446239000, - 0.19509032201612861000, 0.09801714032956082600, - 0.00000000000000012246, -0.09801714032956059000, - -0.19509032201612836000, -0.29028467725446211000, - -0.38268343236508967000, -0.47139673682599764000, - -0.55557023301960196000, -0.63439328416364527000, - -0.70710678118654746000, -0.77301045336273666000, - -0.83146961230254524000, -0.88192126434835494000, - -0.92387953251128652000, -0.95694033573220882000, - -0.98078528040323032000, -0.99518472667219693000, - -1.00000000000000000000, -0.99518472667219693000, - -0.98078528040323043000, -0.95694033573220894000, - -0.92387953251128663000, -0.88192126434835505000, - -0.83146961230254546000, -0.77301045336273688000, - -0.70710678118654768000, -0.63439328416364593000, - -0.55557023301960218000, -0.47139673682599792000, - -0.38268343236509039000, -0.29028467725446250000, - -0.19509032201612872000, -0.09801714032956050600, - }; - - static const double k2 = (2*CV_PI)/N; - - static const double sin_a0 = -0.166630293345647*k2*k2*k2; - static const double sin_a2 = k2; - - static const double cos_a0 = -0.499818138450326*k2*k2; - /*static const double cos_a2 = 1;*/ - - double k1; - int i = 0; - - if( !angle_in_degrees ) - k1 = N/(2*CV_PI); - else - k1 = N/360.; - -#if CV_AVX2 - if (USE_AVX2) - { - __m128d v_k1 = _mm_set1_pd(k1); - __m128d v_1 = _mm_set1_pd(1); - __m128i v_N1 = _mm_set1_epi32(N - 1); - __m128i v_N4 = _mm_set1_epi32(N >> 2); - __m128d v_sin_a0 = _mm_set1_pd(sin_a0); - __m128d v_sin_a2 = _mm_set1_pd(sin_a2); - __m128d v_cos_a0 = _mm_set1_pd(cos_a0); - - for ( ; i <= len - 4; i += 4) - { - __m128 v_angle = _mm_loadu_ps(angle + i); - - // 0-1 - __m128d v_t = _mm_mul_pd(_mm_cvtps_pd(v_angle), v_k1); - __m128i v_it = _mm_cvtpd_epi32(v_t); - v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); - - __m128i v_sin_idx = _mm_and_si128(v_it, v_N1); - __m128i v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); - - __m128d v_t2 = _mm_mul_pd(v_t, v_t); - __m128d v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); - __m128d v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - - __m128d v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); - __m128d v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); - - __m128d v_sin_val_0 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), - _mm_mul_pd(v_cos_a, v_sin_b)); - __m128d v_cos_val_0 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), - _mm_mul_pd(v_sin_a, v_sin_b)); - - // 2-3 - v_t = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_angle), 8))), v_k1); - v_it = _mm_cvtpd_epi32(v_t); - v_t = _mm_sub_pd(v_t, _mm_cvtepi32_pd(v_it)); - - v_sin_idx = _mm_and_si128(v_it, v_N1); - v_cos_idx = _mm_and_si128(_mm_sub_epi32(v_N4, v_sin_idx), v_N1); - - v_t2 = _mm_mul_pd(v_t, v_t); - v_sin_b = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(v_sin_a0, v_t2), v_sin_a2), v_t); - v_cos_b = _mm_add_pd(_mm_mul_pd(v_cos_a0, v_t2), v_1); - - v_sin_a = _mm_i32gather_pd(sin_table, v_sin_idx, 8); - v_cos_a = _mm_i32gather_pd(sin_table, v_cos_idx, 8); - - __m128d v_sin_val_1 = _mm_add_pd(_mm_mul_pd(v_sin_a, v_cos_b), - _mm_mul_pd(v_cos_a, v_sin_b)); - __m128d v_cos_val_1 = _mm_sub_pd(_mm_mul_pd(v_cos_a, v_cos_b), - _mm_mul_pd(v_sin_a, v_sin_b)); - - _mm_storeu_ps(sinval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_sin_val_0), - _mm_cvtpd_ps(v_sin_val_1))); - _mm_storeu_ps(cosval + i, _mm_movelh_ps(_mm_cvtpd_ps(v_cos_val_0), - _mm_cvtpd_ps(v_cos_val_1))); - } - } -#endif - - for( ; i < len; i++ ) - { - double t = angle[i]*k1; - int it = cvRound(t); - t -= it; - int sin_idx = it & (N - 1); - int cos_idx = (N/4 - sin_idx) & (N - 1); - - double sin_b = (sin_a0*t*t + sin_a2)*t; - double cos_b = cos_a0*t*t + 1; - - double sin_a = sin_table[sin_idx]; - double cos_a = sin_table[cos_idx]; - - double sin_val = sin_a*cos_b + cos_a*sin_b; - double cos_val = cos_a*cos_b - sin_a*sin_b; - - sinval[i] = (float)sin_val; - cosval[i] = (float)cos_val; - } -} - - #ifdef HAVE_OPENCL static bool ocl_polarToCart( InputArray _mag, InputArray _angle, @@ -587,11 +444,13 @@ void polarToCart( InputArray src1, InputArray src2, CV_Assert(dst1.getObj() != dst2.getObj()); +#ifdef HAVE_IPP const bool isInPlace = (src1.getObj() == dst1.getObj()) || (src1.getObj() == dst2.getObj()) || (src2.getObj() == dst1.getObj()) || (src2.getObj() == dst2.getObj()); +#endif int type = src2.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); CV_Assert((depth == CV_32F || depth == CV_64F) && (src1.empty() || src1.type() == type)); @@ -610,95 +469,25 @@ void polarToCart( InputArray src1, InputArray src2, const Mat* arrays[] = {&Mag, &Angle, &X, &Y, 0}; uchar* ptrs[4] = {}; NAryMatIterator it(arrays, ptrs); - cv::AutoBuffer _buf; - float* buf[2] = {0, 0}; - int j, k, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn); + int j, total = (int)(it.size*cn), blockSize = std::min(total, ((BLOCK_SIZE+cn-1)/cn)*cn); size_t esz1 = Angle.elemSize1(); - if (( depth == CV_64F ) || isInPlace) - { - _buf.allocate(blockSize*2); - buf[0] = _buf.data(); - buf[1] = buf[0] + blockSize; - } - for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( j = 0; j < total; j += blockSize ) { int len = std::min(total - j, blockSize); - if (( depth == CV_32F ) && !isInPlace) + if ( depth == CV_32F ) { const float *mag = (const float*)ptrs[0], *angle = (const float*)ptrs[1]; float *x = (float*)ptrs[2], *y = (float*)ptrs[3]; - - SinCos_32f( angle, y, x, len, angleInDegrees ); - if( mag ) - { - k = 0; - -#if (CV_SIMD || CV_SIMD_SCALABLE) - int cWidth = VTraits::vlanes(); - for( ; k <= len - cWidth; k += cWidth ) - { - v_float32 v_m = vx_load(mag + k); - v_store(x + k, v_mul(vx_load(x + k), v_m)); - v_store(y + k, v_mul(vx_load(y + k), v_m)); - } - vx_cleanup(); -#endif - - for( ; k < len; k++ ) - { - float m = mag[k]; - x[k] *= m; y[k] *= m; - } - } - } - else if (( depth == CV_32F ) && isInPlace) - { - const float *mag = (const float*)ptrs[0], *angle = (const float*)ptrs[1]; - float *x = (float*)ptrs[2], *y = (float*)ptrs[3]; - - for( k = 0; k < len; k++ ) - buf[0][k] = (float)angle[k]; - - SinCos_32f( buf[0], buf[1], buf[0], len, angleInDegrees ); - if( mag ) - for( k = 0; k < len; k++ ) - { - float m = mag[k]; - x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; - } - else - { - std::memcpy(x, buf[0], sizeof(float) * len); - std::memcpy(y, buf[1], sizeof(float) * len); - } + hal::polarToCart32f( mag, angle, x, y, len, angleInDegrees ); } else { const double *mag = (const double*)ptrs[0], *angle = (const double*)ptrs[1]; double *x = (double*)ptrs[2], *y = (double*)ptrs[3]; - - for( k = 0; k < len; k++ ) - buf[0][k] = (float)angle[k]; - - SinCos_32f( buf[0], buf[1], buf[0], len, angleInDegrees ); - if( mag ) - for( k = 0; k < len; k++ ) - { - double m = mag[k]; - x[k] = buf[0][k]*m; y[k] = buf[1][k]*m; - } - else - { - for( k = 0; k < len; k++ ) - { - x[k] = buf[0][k]; - y[k] = buf[1][k]; - } - } + hal::polarToCart64f( mag, angle, x, y, len, angleInDegrees ); } if( ptrs[0] ) diff --git a/modules/core/src/mathfuncs_core.dispatch.cpp b/modules/core/src/mathfuncs_core.dispatch.cpp index 485eac27b4..84e4e6a652 100644 --- a/modules/core/src/mathfuncs_core.dispatch.cpp +++ b/modules/core/src/mathfuncs_core.dispatch.cpp @@ -29,6 +29,26 @@ void cartToPolar64f(const double* x, const double* y, double* mag, double* angle CV_CPU_DISPATCH_MODES_ALL); } +void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int len, bool angleInDegrees) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(polarToCart32f, cv_hal_polarToCart32f, mag, angle, x, y, len, angleInDegrees); + + CV_CPU_DISPATCH(polarToCart32f, (mag, angle, x, y, len, angleInDegrees), + CV_CPU_DISPATCH_MODES_ALL); +} + +void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int len, bool angleInDegrees) +{ + CV_INSTRUMENT_REGION(); + + CALL_HAL(polarToCart64f, cv_hal_polarToCart64f, mag, angle, x, y, len, angleInDegrees); + + CV_CPU_DISPATCH(polarToCart64f, (mag, angle, x, y, len, angleInDegrees), + CV_CPU_DISPATCH_MODES_ALL); +} + void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { CV_INSTRUMENT_REGION(); diff --git a/modules/core/src/mathfuncs_core.simd.hpp b/modules/core/src/mathfuncs_core.simd.hpp index e9d57a5c4d..d9289ecb4e 100644 --- a/modules/core/src/mathfuncs_core.simd.hpp +++ b/modules/core/src/mathfuncs_core.simd.hpp @@ -11,6 +11,8 @@ CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN // forward declarations void cartToPolar32f(const float *X, const float *Y, float* mag, float *angle, int len, bool angleInDegrees); void cartToPolar64f(const double *X, const double *Y, double* mag, double *angle, int len, bool angleInDegrees); +void polarToCart32f(const float *mag, const float *angle, float *X, float *Y, int len, bool angleInDegrees); +void polarToCart64f(const double *mag, const double *angle, double *X, double *Y, int len, bool angleInDegrees); void fastAtan32f(const float *Y, const float *X, float *angle, int len, bool angleInDegrees); void fastAtan64f(const double *Y, const double *X, double *angle, int len, bool angleInDegrees); void fastAtan2(const float *Y, const float *X, float *angle, int len, bool angleInDegrees); @@ -178,6 +180,167 @@ void cartToPolar64f(const double *X, const double *Y, double *mag, double *angle } } +namespace { + +static inline void SinCos_32f(const float* mag, const float* angle, float* cosval, float* sinval, int len, int angle_in_degrees) +{ + const int N = 64; + + static const double sin_table[] = + { + 0.00000000000000000000, 0.09801714032956060400, + 0.19509032201612825000, 0.29028467725446233000, + 0.38268343236508978000, 0.47139673682599764000, + 0.55557023301960218000, 0.63439328416364549000, + 0.70710678118654746000, 0.77301045336273699000, + 0.83146961230254524000, 0.88192126434835494000, + 0.92387953251128674000, 0.95694033573220894000, + 0.98078528040323043000, 0.99518472667219682000, + 1.00000000000000000000, 0.99518472667219693000, + 0.98078528040323043000, 0.95694033573220894000, + 0.92387953251128674000, 0.88192126434835505000, + 0.83146961230254546000, 0.77301045336273710000, + 0.70710678118654757000, 0.63439328416364549000, + 0.55557023301960218000, 0.47139673682599786000, + 0.38268343236508989000, 0.29028467725446239000, + 0.19509032201612861000, 0.09801714032956082600, + 0.00000000000000012246, -0.09801714032956059000, + -0.19509032201612836000, -0.29028467725446211000, + -0.38268343236508967000, -0.47139673682599764000, + -0.55557023301960196000, -0.63439328416364527000, + -0.70710678118654746000, -0.77301045336273666000, + -0.83146961230254524000, -0.88192126434835494000, + -0.92387953251128652000, -0.95694033573220882000, + -0.98078528040323032000, -0.99518472667219693000, + -1.00000000000000000000, -0.99518472667219693000, + -0.98078528040323043000, -0.95694033573220894000, + -0.92387953251128663000, -0.88192126434835505000, + -0.83146961230254546000, -0.77301045336273688000, + -0.70710678118654768000, -0.63439328416364593000, + -0.55557023301960218000, -0.47139673682599792000, + -0.38268343236509039000, -0.29028467725446250000, + -0.19509032201612872000, -0.09801714032956050600, + }; + + static const double k2 = (2*CV_PI)/N; + + static const double sin_a0 = -0.166630293345647*k2*k2*k2; + static const double sin_a2 = k2; + + static const double cos_a0 = -0.499818138450326*k2*k2; + /*static const double cos_a2 = 1;*/ + + double k1; + int i = 0; + + if( !angle_in_degrees ) + k1 = N/(2*CV_PI); + else + k1 = N/360.; + +#if (CV_SIMD || CV_SIMD_SCALABLE) + const int VECSZ = VTraits::vlanes(); + const v_float32 scale = vx_setall_f32(angle_in_degrees ? (float)CV_PI / 180.f : 1.f); + + for( ; i < len; i += VECSZ*2 ) + { + if( i + VECSZ*2 > len ) + { + // if it's inplace operation, we cannot repeatedly process + // the tail for the second time, so we have to use the + // scalar code + if( i == 0 || angle == cosval || angle == sinval || mag == cosval || mag == sinval ) + break; + i = len - VECSZ*2; + } + + v_float32 r0 = v_mul(vx_load(angle + i), scale); + v_float32 r1 = v_mul(vx_load(angle + i + VECSZ), scale); + + v_float32 c0, c1, s0, s1; + v_sincos(r0, s0, c0); + v_sincos(r1, s1, c1); + + if( mag ) + { + v_float32 m0 = vx_load(mag + i); + v_float32 m1 = vx_load(mag + i + VECSZ); + c0 = v_mul(c0, m0); + c1 = v_mul(c1, m1); + s0 = v_mul(s0, m0); + s1 = v_mul(s1, m1); + } + + v_store(cosval + i, c0); + v_store(cosval + i + VECSZ, c1); + + v_store(sinval + i, s0); + v_store(sinval + i + VECSZ, s1); + } + vx_cleanup(); +#endif + + for( ; i < len; i++ ) + { + double t = angle[i]*k1; + int it = cvRound(t); + t -= it; + int sin_idx = it & (N - 1); + int cos_idx = (N/4 - sin_idx) & (N - 1); + + double sin_b = (sin_a0*t*t + sin_a2)*t; + double cos_b = cos_a0*t*t + 1; + + double sin_a = sin_table[sin_idx]; + double cos_a = sin_table[cos_idx]; + + double sin_val = sin_a*cos_b + cos_a*sin_b; + double cos_val = cos_a*cos_b - sin_a*sin_b; + + if (mag) + { + double mag_val = mag[i]; + sin_val *= mag_val; + cos_val *= mag_val; + } + + sinval[i] = (float)sin_val; + cosval[i] = (float)cos_val; + } +} + +} // anonymous:: + +void polarToCart32f(const float *mag, const float *angle, float *X, float *Y, int len, bool angleInDegrees) +{ + CV_INSTRUMENT_REGION(); + SinCos_32f(mag, angle, X, Y, len, angleInDegrees); +} + +void polarToCart64f(const double *mag, const double *angle, double *X, double *Y, int len, bool angleInDegrees) +{ + CV_INSTRUMENT_REGION(); + + const int BLKSZ = 128; + float ybuf[BLKSZ], xbuf[BLKSZ], _mbuf[BLKSZ], abuf[BLKSZ]; + float* mbuf = mag ? _mbuf : nullptr; + for( int i = 0; i < len; i += BLKSZ ) + { + int j, blksz = std::min(BLKSZ, len - i); + for( j = 0; j < blksz; j++ ) + { + if (mbuf) + mbuf[j] = (float)mag[i + j]; + abuf[j] = (float)angle[i + j]; + } + SinCos_32f(mbuf, abuf, xbuf, ybuf, blksz, angleInDegrees); + for( j = 0; j < blksz; j++ ) + X[i + j] = xbuf[j]; + for( j = 0; j < blksz; j++ ) + Y[i + j] = ybuf[j]; + } +} + static void fastAtan32f_(const float *Y, const float *X, float *angle, int len, bool angleInDegrees ) { float scale = angleInDegrees ? 1.f : (float)(CV_PI/180);