diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index 28fe6a02a9..781bd045a5 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -191,6 +191,19 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double) #endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +template inline _VecTp v_setzero_(); +template inline _VecTp v_setall_(uchar); +template inline _VecTp v_setall_(schar); +template inline _VecTp v_setall_(ushort); +template inline _VecTp v_setall_(short); +template inline _VecTp v_setall_(unsigned); +template inline _VecTp v_setall_(int); +template inline _VecTp v_setall_(uint64); +template inline _VecTp v_setall_(int64); +template inline _VecTp v_setall_(float); +template inline _VecTp v_setall_(double); + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif @@ -958,7 +971,6 @@ namespace CV__SIMD_NAMESPACE { #define CV_SIMD 0 #endif -#include "intrin_math.hpp" #include "simd_utils.impl.hpp" #ifndef CV_DOXYGEN diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index e204050625..6e415f1e75 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -447,6 +447,10 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d) { return _Tpvec(_mm256_setzero_si256()); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ 
OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ @@ -472,6 +476,10 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64) { return _Tpvec(_mm256_setzero_##zsuffix()); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ { return _Tpvec(_mm256_set1_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ @@ -3158,6 +3166,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a) inline void v256_cleanup() { _mm256_zeroall(); } +#include "intrin_math.hpp" +inline v_float32x8 v_exp(v_float32x8 x) { return v_exp_default_32f(x); } +inline v_float32x8 v_log(v_float32x8 x) { return v_log_default_32f(x); } +inline v_float32x8 v_erf(v_float32x8 x) { return v_erf_default_32f(x); } + +inline v_float64x4 v_exp(v_float64x4 x) { return v_exp_default_64f(x); } +inline v_float64x4 v_log(v_float64x4 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
@endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index 64dab6b3ae..24007f4d16 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -458,6 +458,10 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) { return _Tpvec(_mm512_setzero_si512()); } \ inline _Tpvec v512_setall_##suffix(_Tp v) \ { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v512_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v512_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ @@ -483,6 +487,10 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) { return _Tpvec(_mm512_setzero_##zsuffix()); } \ inline _Tpvec v512_setall_##suffix(_Tp v) \ { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v512_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v512_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ @@ -3070,6 +3078,14 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm inline void v512_cleanup() { _mm256_zeroall(); } +#include "intrin_math.hpp" +inline v_float32x16 v_exp(v_float32x16 x) { return v_exp_default_32f(x); } +inline v_float32x16 v_log(v_float32x16 x) { return v_log_default_32f(x); } +inline v_float32x16 v_erf(v_float32x16 x) { return v_erf_default_32f(x); } + +inline v_float64x8 v_exp(v_float64x8 x) { return v_exp_default_64f(x); } 
+inline v_float64x8 v_log(v_float64x8 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index ef1a33a630..653f51b145 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -263,7 +263,7 @@ Most of these operations return only one value. ### Other math -- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, +- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log, @ref v_erf - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs @@ -2801,7 +2801,8 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, //! @brief Helper macro //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \ -inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } +inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \ +template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); } //! @name Init with zero //! @{ @@ -2847,7 +2848,8 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64) //! @brief Helper macro //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \ -inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } +inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \ +template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); } //! @name Init with value //! 
@{ diff --git a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp index 45f53de8a2..1163e65748 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lasx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lasx.hpp @@ -557,6 +557,10 @@ inline __m256i _lasx_256_castpd_si256(const __m256d& v) { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ @@ -588,7 +592,11 @@ inline __m256d _lasx_256_castsi256_pd(const __m256i &v) inline _Tpvec v256_setzero_##suffix() \ { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ - { return _Tpvec(_v256_setall_##zsuffix(v)); } \ + { return _Tpvec(_v256_setall_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \ OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ @@ -3005,6 +3013,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x8& a) inline void v256_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x8 v_exp(v_float32x8 x) { return v_exp_default_32f(x); } +inline v_float32x8 v_log(v_float32x8 x) { return v_log_default_32f(x); } +inline v_float32x8 v_erf(v_float32x8 x) { return v_erf_default_32f(x); } + +inline v_float64x4 v_exp(v_float64x4 x) { return v_exp_default_64f(x); } 
+inline v_float64x4 v_log(v_float64x4 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp index aa997070c3..b2aeb4fcc2 100644 --- a/modules/core/include/opencv2/core/hal/intrin_lsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_lsx.hpp @@ -417,6 +417,10 @@ inline __m128i _lsx_128_castpd_si128(const __m128d& v) { return _Tpvec(__lsx_vldi(0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) \ { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v_setall_##suffix(v); } \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \ @@ -448,6 +452,10 @@ inline __m128d _lsx_128_castsi128_pd(const __m128i &v) { return _Tpvec(__lsx_vldi(0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) \ { return _Tpvec(_v128_setall_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v_setall_##suffix(v); } \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \ OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \ @@ -2515,6 +2523,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& a) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 
v_log(v_float64x2 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_math.hpp b/modules/core/include/opencv2/core/hal/intrin_math.hpp index 06a4e27080..36aa90902e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_math.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_math.hpp @@ -2,10 +2,6 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html -// This header is not standalone. Don't include directly, use "intrin.hpp" instead. -#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp - -namespace CV__SIMD_NAMESPACE { /* Universal Intrinsics implementation of sin, cos, exp and log @@ -34,434 +30,416 @@ namespace CV__SIMD_NAMESPACE { (this is the zlib license) */ - -#ifndef OPENCV_HAL_MATH_HAVE_EXP +#ifndef OPENCV_HAL_INTRIN_MATH_HPP +#define OPENCV_HAL_INTRIN_MATH_HPP //! @name Exponential //! @{ -#if defined(CV_SIMD_FP16) && CV_SIMD_FP16 - // Implementation is the same as float32 vector. 
- inline v_float16 v_exp(const v_float16 &x) { - const v_float16 _vexp_lo_f16 = vx_setall_f16(-10.7421875f); - const v_float16 _vexp_hi_f16 = vx_setall_f16(11.f); - const v_float16 _vexp_half_fp16 = vx_setall_f16(0.5f); - const v_float16 _vexp_one_fp16 = vx_setall_f16(1.f); - const v_float16 _vexp_LOG2EF_f16 = vx_setall_f16(1.44269504088896341f); - const v_float16 _vexp_C1_f16 = vx_setall_f16(-6.93359375E-1f); - const v_float16 _vexp_C2_f16 = vx_setall_f16(2.12194440E-4f); - const v_float16 _vexp_p0_f16 = vx_setall_f16(1.9875691500E-4f); - const v_float16 _vexp_p1_f16 = vx_setall_f16(1.3981999507E-3f); - const v_float16 _vexp_p2_f16 = vx_setall_f16(8.3334519073E-3f); - const v_float16 _vexp_p3_f16 = vx_setall_f16(4.1665795894E-2f); - const v_float16 _vexp_p4_f16 = vx_setall_f16(1.6666665459E-1f); - const v_float16 _vexp_p5_f16 = vx_setall_f16(5.0000001201E-1f); - const v_int16 _vexp_bias_s16 = vx_setall_s16(0xf); +// Implementation is the same as float32 vector. +template +inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) { + const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f); + const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f); + const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f); + const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f); + const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f); + const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f); + const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f); + const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f); + const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f); + const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f); + const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f); + const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f); + const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f); - v_float16 _vexp_, _vexp_x, 
_vexp_y, _vexp_xx; - v_int16 _vexp_mm; + _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx; + _TpVec16S _vexp_mm; + const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf); - // compute exponential of x - _vexp_x = v_max(x, _vexp_lo_f16); - _vexp_x = v_min(_vexp_x, _vexp_hi_f16); + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f16); + _vexp_x = v_min(_vexp_x, _vexp_hi_f16); - _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16); - _vexp_mm = v_floor(_vexp_); - _vexp_ = v_cvt_f16(_vexp_mm); - _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); - _vexp_mm = v_shl(_vexp_mm, 10); + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16); + _vexp_mm = v_floor(_vexp_); + _vexp_ = v_cvt_f16(_vexp_mm); + _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); + _vexp_mm = v_shl(_vexp_mm, 10); - _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); - _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); - _vexp_xx = v_mul(_vexp_x, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); - _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16); + _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16); - _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); - _vexp_y = v_add(_vexp_y, _vexp_one_fp16); - _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); + _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); + _vexp_y = v_add(_vexp_y, _vexp_one_fp16); + _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); - // exp(NAN) -> NAN - v_float16 mask_not_nan = v_not_nan(x); - return v_select(mask_not_nan, 
_vexp_y, v_reinterpret_as_f16(vx_setall_s16(0x7e00))); - } -#endif + // exp(NAN) -> NAN + _TpVec16F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); +} - inline v_float32 v_exp(const v_float32 &x) { - const v_float32 _vexp_lo_f32 = vx_setall_f32(-88.3762626647949f); - const v_float32 _vexp_hi_f32 = vx_setall_f32(89.f); - const v_float32 _vexp_half_fp32 = vx_setall_f32(0.5f); - const v_float32 _vexp_one_fp32 = vx_setall_f32(1.f); - const v_float32 _vexp_LOG2EF_f32 = vx_setall_f32(1.44269504088896341f); - const v_float32 _vexp_C1_f32 = vx_setall_f32(-6.93359375E-1f); - const v_float32 _vexp_C2_f32 = vx_setall_f32(2.12194440E-4f); - const v_float32 _vexp_p0_f32 = vx_setall_f32(1.9875691500E-4f); - const v_float32 _vexp_p1_f32 = vx_setall_f32(1.3981999507E-3f); - const v_float32 _vexp_p2_f32 = vx_setall_f32(8.3334519073E-3f); - const v_float32 _vexp_p3_f32 = vx_setall_f32(4.1665795894E-2f); - const v_float32 _vexp_p4_f32 = vx_setall_f32(1.6666665459E-1f); - const v_float32 _vexp_p5_f32 = vx_setall_f32(5.0000001201E-1f); - const v_int32 _vexp_bias_s32 = vx_setall_s32(0x7f); +template +inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) { + const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f); + const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f); + const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f); + const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f); + const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); + const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f); + const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f); + const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f); + const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f); + const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f); + const _TpVec32F _vexp_p3_f32 = 
v_setall_<_TpVec32F>(4.1665795894E-2f); + const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f); + const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f); - v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_xx; - v_int32 _vexp_mm; + _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx; + _TpVec32S _vexp_mm; + const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); - // compute exponential of x - _vexp_x = v_max(x, _vexp_lo_f32); - _vexp_x = v_min(_vexp_x, _vexp_hi_f32); + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f32); + _vexp_x = v_min(_vexp_x, _vexp_hi_f32); - _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32); - _vexp_mm = v_floor(_vexp_); - _vexp_ = v_cvt_f32(_vexp_mm); - _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32); - _vexp_mm = v_shl(_vexp_mm, 23); + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32); + _vexp_mm = v_floor(_vexp_); + _vexp_ = v_cvt_f32(_vexp_mm); + _vexp_mm = v_add(_vexp_mm, _vexp_bias_s32); + _vexp_mm = v_shl(_vexp_mm, 23); - _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); - _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); - _vexp_xx = v_mul(_vexp_x, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); - _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32); - _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32); + _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32); - _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); - _vexp_y = v_add(_vexp_y, _vexp_one_fp32); - _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); + _vexp_y = v_fma(_vexp_y, _vexp_xx, 
_vexp_x); + _vexp_y = v_add(_vexp_y, _vexp_one_fp32); + _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); - // exp(NAN) -> NAN - v_float32 mask_not_nan = v_not_nan(x); - return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000))); - } + // exp(NAN) -> NAN + _TpVec32F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); +} -#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F - inline v_float64 v_exp(const v_float64 &x) { - const v_float64 _vexp_lo_f64 = vx_setall_f64(-709.43613930310391424428); - const v_float64 _vexp_hi_f64 = vx_setall_f64(710.); - const v_float64 _vexp_half_f64 = vx_setall_f64(0.5); - const v_float64 _vexp_one_f64 = vx_setall_f64(1.0); - const v_float64 _vexp_two_f64 = vx_setall_f64(2.0); - const v_float64 _vexp_LOG2EF_f64 = vx_setall_f64(1.44269504088896340736); - const v_float64 _vexp_C1_f64 = vx_setall_f64(-6.93145751953125E-1); - const v_float64 _vexp_C2_f64 = vx_setall_f64(-1.42860682030941723212E-6); - const v_float64 _vexp_p0_f64 = vx_setall_f64(1.26177193074810590878E-4); - const v_float64 _vexp_p1_f64 = vx_setall_f64(3.02994407707441961300E-2); - const v_float64 _vexp_p2_f64 = vx_setall_f64(9.99999999999999999910E-1); - const v_float64 _vexp_q0_f64 = vx_setall_f64(3.00198505138664455042E-6); - const v_float64 _vexp_q1_f64 = vx_setall_f64(2.52448340349684104192E-3); - const v_float64 _vexp_q2_f64 = vx_setall_f64(2.27265548208155028766E-1); - const v_float64 _vexp_q3_f64 = vx_setall_f64(2.00000000000000000009E0); - const v_int64 _vexp_bias_s64 = vx_setall_s64(0x3ff); +template +inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) { + const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428); + const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.); + const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5); + const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0); + const _TpVec64F _vexp_two_f64 = 
v_setall_<_TpVec64F>(2.0); + const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); + const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1); + const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6); + const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4); + const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2); + const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1); + const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6); + const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3); + const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1); + const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0); - v_float64 _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx; - v_int64 _vexp_mm; + _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx; + _TpVec64S _vexp_mm; + const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); - // compute exponential of x - _vexp_x = v_max(x, _vexp_lo_f64); - _vexp_x = v_min(_vexp_x, _vexp_hi_f64); + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f64); + _vexp_x = v_min(_vexp_x, _vexp_hi_f64); - _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); - _vexp_mm = v_expand_low(v_floor(_vexp_)); - _vexp_ = v_cvt_f64(_vexp_mm); - _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); - _vexp_mm = v_shl(_vexp_mm, 52); + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); + _vexp_mm = v_expand_low(v_floor(_vexp_)); + _vexp_ = v_cvt_f64(_vexp_mm); + _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); + _vexp_mm = v_shl(_vexp_mm, 52); - _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); - _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); - _vexp_xx = v_mul(_vexp_x, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); - 
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); - _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64); - _vexp_y = v_mul(_vexp_y, _vexp_x); + _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); + _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64); + _vexp_y = v_mul(_vexp_y, _vexp_x); - _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); - _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64); - _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64); + _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); + _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64); + _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64); - _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); - _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); - _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); + _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); + _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); + _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); - // exp(NAN) -> NAN - v_float64 mask_not_nan = v_not_nan(x); - return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000))); - } -#endif - -#define OPENCV_HAL_MATH_HAVE_EXP 1 + // exp(NAN) -> NAN + _TpVec64F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000))); +} //! @} -#endif - -#ifndef OPENCV_HAL_MATH_HAVE_LOG //! @name Natural Logarithm //! 
@{ -#if defined(CV_SIMD_FP16) && CV_SIMD_FP16 - inline v_float16 v_log(const v_float16 &x) { - const v_float16 _vlog_one_fp16 = vx_setall_f16(1.0f); - const v_float16 _vlog_SQRTHF_fp16 = vx_setall_f16(0.707106781186547524f); - const v_float16 _vlog_q1_fp16 = vx_setall_f16(-2.12194440E-4f); - const v_float16 _vlog_q2_fp16 = vx_setall_f16(0.693359375f); - const v_float16 _vlog_p0_fp16 = vx_setall_f16(7.0376836292E-2f); - const v_float16 _vlog_p1_fp16 = vx_setall_f16(-1.1514610310E-1f); - const v_float16 _vlog_p2_fp16 = vx_setall_f16(1.1676998740E-1f); - const v_float16 _vlog_p3_fp16 = vx_setall_f16(-1.2420140846E-1f); - const v_float16 _vlog_p4_fp16 = vx_setall_f16(1.4249322787E-1f); - const v_float16 _vlog_p5_fp16 = vx_setall_f16(-1.6668057665E-1f); - const v_float16 _vlog_p6_fp16 = vx_setall_f16(2.0000714765E-1f); - const v_float16 _vlog_p7_fp16 = vx_setall_f16(-2.4999993993E-1f); - const v_float16 _vlog_p8_fp16 = vx_setall_f16(3.3333331174E-1f); - const v_int16 _vlog_inv_mant_mask_s16 = vx_setall_s16(~0x7c00); +template +inline _TpVec16F v_log_default_16f(const _TpVec16F &x) { + const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f); + const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f); + const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f); + const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f); + const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f); + const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f); + const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f); + const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f); + const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f); + const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f); + const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f); + const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f); + const _TpVec16F 
_vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f); - v_float16 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; - v_int16 _vlog_ux, _vlog_emm0; + _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + _TpVec16S _vlog_ux, _vlog_emm0; + const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00); - _vlog_ux = v_reinterpret_as_s16(x); - _vlog_emm0 = v_shr(_vlog_ux, 10); + _vlog_ux = v_reinterpret_as_s16(x); + _vlog_emm0 = v_shr(_vlog_ux, 10); - _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); - _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(vx_setall_f16(0.5f))); - _vlog_x = v_reinterpret_as_f16(_vlog_ux); + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f))); + _vlog_x = v_reinterpret_as_f16(_vlog_ux); - _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s16(0xf)); - _vlog_e = v_cvt_f16(_vlog_emm0); + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf)); + _vlog_e = v_cvt_f16(_vlog_emm0); - _vlog_e = v_add(_vlog_e, _vlog_one_fp16); + _vlog_e = v_add(_vlog_e, _vlog_one_fp16); - v_float16 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); - _vlog_tmp = v_and(_vlog_x, _vlog_mask); - _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); - _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); - _vlog_x = v_add(_vlog_x, _vlog_tmp); + _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); - _vlog_z = v_mul(_vlog_x, _vlog_x); + _vlog_z = v_mul(_vlog_x, _vlog_x); - _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); - _vlog_y = 
v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); - _vlog_y = v_mul(_vlog_y, _vlog_x); - _vlog_y = v_mul(_vlog_y, _vlog_z); + _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); - _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); - _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f16(0.5f))); + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f))); - _vlog_x = v_add(_vlog_x, _vlog_y); - _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); - // log(0) -> -INF - v_float16 mask_zero = v_eq(x, vx_setzero_f16()); - _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(vx_setall_s16(0xfc00)), _vlog_x); - // log(NEG), log(NAN) -> NAN - v_float16 mask_not_nan = v_ge(x, vx_setzero_f16()); - _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(vx_setall_s16(0x7e00))); - // log(INF) -> INF - v_float16 mask_inf = v_eq(x, v_reinterpret_as_f16(vx_setall_s16(0x7c00))); - _vlog_x = v_select(mask_inf, x, _vlog_x); - return _vlog_x; - } -#endif + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); + // log(0) -> -INF + _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x); + // log(NEG), log(NAN) -> NAN + _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>()); + _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); + // log(INF) -> INF + _TpVec16F 
mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; +} - inline v_float32 v_log(const v_float32 &x) { - const v_float32 _vlog_one_fp32 = vx_setall_f32(1.0f); - const v_float32 _vlog_SQRTHF_fp32 = vx_setall_f32(0.707106781186547524f); - const v_float32 _vlog_q1_fp32 = vx_setall_f32(-2.12194440E-4f); - const v_float32 _vlog_q2_fp32 = vx_setall_f32(0.693359375f); - const v_float32 _vlog_p0_fp32 = vx_setall_f32(7.0376836292E-2f); - const v_float32 _vlog_p1_fp32 = vx_setall_f32(-1.1514610310E-1f); - const v_float32 _vlog_p2_fp32 = vx_setall_f32(1.1676998740E-1f); - const v_float32 _vlog_p3_fp32 = vx_setall_f32(-1.2420140846E-1f); - const v_float32 _vlog_p4_fp32 = vx_setall_f32(1.4249322787E-1f); - const v_float32 _vlog_p5_fp32 = vx_setall_f32(-1.6668057665E-1f); - const v_float32 _vlog_p6_fp32 = vx_setall_f32(2.0000714765E-1f); - const v_float32 _vlog_p7_fp32 = vx_setall_f32(-2.4999993993E-1f); - const v_float32 _vlog_p8_fp32 = vx_setall_f32(3.3333331174E-1f); - const v_int32 _vlog_inv_mant_mask_s32 = vx_setall_s32(~0x7f800000); +template +inline _TpVec32F v_log_default_32f(const _TpVec32F &x) { + const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f); + const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f); + const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f); + const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f); + const _TpVec32F _vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f); + const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f); + const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f); + const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f); + const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f); + const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f); + const _TpVec32F _vlog_p6_fp32 = 
v_setall_<_TpVec32F>(2.0000714765E-1f); + const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f); + const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f); - v_float32 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; - v_int32 _vlog_ux, _vlog_emm0; + _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + _TpVec32S _vlog_ux, _vlog_emm0; + const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000); - _vlog_ux = v_reinterpret_as_s32(x); - _vlog_emm0 = v_shr(_vlog_ux, 23); + _vlog_ux = v_reinterpret_as_s32(x); + _vlog_emm0 = v_shr(_vlog_ux, 23); - _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); - _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(vx_setall_f32(0.5f))); - _vlog_x = v_reinterpret_as_f32(_vlog_ux); + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f))); + _vlog_x = v_reinterpret_as_f32(_vlog_ux); - _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s32(0x7f)); - _vlog_e = v_cvt_f32(_vlog_emm0); + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f)); + _vlog_e = v_cvt_f32(_vlog_emm0); - _vlog_e = v_add(_vlog_e, _vlog_one_fp32); + _vlog_e = v_add(_vlog_e, _vlog_one_fp32); - v_float32 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); - _vlog_tmp = v_and(_vlog_x, _vlog_mask); - _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); - _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); - _vlog_x = v_add(_vlog_x, _vlog_tmp); + _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); - _vlog_z = v_mul(_vlog_x, _vlog_x); + _vlog_z = v_mul(_vlog_x, _vlog_x); - _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, 
_vlog_p4_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); - _vlog_y = v_mul(_vlog_y, _vlog_x); - _vlog_y = v_mul(_vlog_y, _vlog_z); + _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); - _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); - _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, vx_setall_f32(0.5))); + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f))); - _vlog_x = v_add(_vlog_x, _vlog_y); - _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); - // log(0) -> -INF - v_float32 mask_zero = v_eq(x, vx_setzero_f32()); - _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(vx_setall_s32(0xff800000)), _vlog_x); - // log(NEG), log(NAN) -> NAN - v_float32 mask_not_nan = v_ge(x, vx_setzero_f32()); - _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000))); - // log(INF) -> INF - v_float32 mask_inf = v_eq(x, v_reinterpret_as_f32(vx_setall_s32(0x7f800000))); - _vlog_x = v_select(mask_inf, x, _vlog_x); - return _vlog_x; - } + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); + // log(0) -> -INF + _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x); + // log(NEG), log(NAN) -> NAN + _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>()); + 
_vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); + // log(INF) -> INF + _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; +} -#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F - inline v_float64 v_log(const v_float64 &x) { - const v_float64 _vlog_one_fp64 = vx_setall_f64(1.0); - const v_float64 _vlog_SQRTHF_fp64 = vx_setall_f64(0.7071067811865475244); - const v_float64 _vlog_p0_fp64 = vx_setall_f64(1.01875663804580931796E-4); - const v_float64 _vlog_p1_fp64 = vx_setall_f64(4.97494994976747001425E-1); - const v_float64 _vlog_p2_fp64 = vx_setall_f64(4.70579119878881725854); - const v_float64 _vlog_p3_fp64 = vx_setall_f64(1.44989225341610930846E1); - const v_float64 _vlog_p4_fp64 = vx_setall_f64(1.79368678507819816313E1); - const v_float64 _vlog_p5_fp64 = vx_setall_f64(7.70838733755885391666); - const v_float64 _vlog_q0_fp64 = vx_setall_f64(1.12873587189167450590E1); - const v_float64 _vlog_q1_fp64 = vx_setall_f64(4.52279145837532221105E1); - const v_float64 _vlog_q2_fp64 = vx_setall_f64(8.29875266912776603211E1); - const v_float64 _vlog_q3_fp64 = vx_setall_f64(7.11544750618563894466E1); - const v_float64 _vlog_q4_fp64 = vx_setall_f64(2.31251620126765340583E1); +template +inline _TpVec64F v_log_default_64f(const _TpVec64F &x) { + const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0); + const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244); + const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4); + const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1); + const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854); + const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1); + const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1); + const _TpVec64F _vlog_p5_fp64 = 
v_setall_<_TpVec64F>(7.70838733755885391666); + const _TpVec64F _vlog_q0_fp64 = v_setall_<_TpVec64F>(1.12873587189167450590E1); + const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1); + const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1); + const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1); + const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1); - const v_float64 _vlog_C0_fp64 = vx_setall_f64(2.121944400546905827679e-4); - const v_float64 _vlog_C1_fp64 = vx_setall_f64(0.693359375); - const v_int64 _vlog_inv_mant_mask_s64 = vx_setall_s64(~0x7ff0000000000000); + const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4); + const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375); - v_float64 _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; - v_int64 _vlog_ux, _vlog_emm0; + _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; + _TpVec64S _vlog_ux, _vlog_emm0; + const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000); - _vlog_ux = v_reinterpret_as_s64(x); - _vlog_emm0 = v_shr(_vlog_ux, 52); + _vlog_ux = v_reinterpret_as_s64(x); + _vlog_emm0 = v_shr(_vlog_ux, 52); - _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); - _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(vx_setall_f64(0.5))); - _vlog_x = v_reinterpret_as_f64(_vlog_ux); + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5))); + _vlog_x = v_reinterpret_as_f64(_vlog_ux); - _vlog_emm0 = v_sub(_vlog_emm0, vx_setall_s64(0x3ff)); - _vlog_e = v_cvt_f64(_vlog_emm0); + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff)); + _vlog_e = v_cvt_f64(_vlog_emm0); - _vlog_e = v_add(_vlog_e, _vlog_one_fp64); + _vlog_e = v_add(_vlog_e, _vlog_one_fp64); - v_float64 _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); - _vlog_tmp = v_and(_vlog_x, 
_vlog_mask); - _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); - _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); - _vlog_x = v_add(_vlog_x, _vlog_tmp); + _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); - _vlog_xx = v_mul(_vlog_x, _vlog_x); + _vlog_xx = v_mul(_vlog_x, _vlog_x); - _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); - _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); - _vlog_y = v_mul(_vlog_y, _vlog_x); - _vlog_y = v_mul(_vlog_y, _vlog_xx); + _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_xx); - _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); - _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); - _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); - _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); - _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); + _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); - _vlog_z = v_div(_vlog_y, _vlog_z); - _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); - _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, vx_setall_f64(0.5))); + _vlog_z = v_div(_vlog_y, _vlog_z); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5))); - 
_vlog_z = v_add(_vlog_z, _vlog_x); - _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); + _vlog_z = v_add(_vlog_z, _vlog_x); + _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); - // log(0) -> -INF - v_float64 mask_zero = v_eq(x, vx_setzero_f64()); - _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(vx_setall_s64(0xfff0000000000000)), _vlog_z); - // log(NEG), log(NAN) -> NAN - v_float64 mask_not_nan = v_ge(x, vx_setzero_f64()); - _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(vx_setall_s64(0x7ff8000000000000))); - // log(INF) -> INF - v_float64 mask_inf = v_eq(x, v_reinterpret_as_f64(vx_setall_s64(0x7ff0000000000000))); - _vlog_z = v_select(mask_inf, x, _vlog_z); - return _vlog_z; - } -#endif - -#define OPENCV_HAL_MATH_HAVE_LOG 1 + // log(0) -> -INF + _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>()); + _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z); + // log(NEG), log(NAN) -> NAN + _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>()); + _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000))); + // log(INF) -> INF + _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); + _vlog_z = v_select(mask_inf, x, _vlog_z); + return _vlog_z; +} //! @} -#endif /* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220 */ -#ifndef OPENCV_HAL_MATH_HAVE_ERF - //! @name Error Function //! 
@{ +template +inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) { + const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f), + coef1 = v_setall_<_TpVec32F>(1.061405429f), + coef2 = v_setall_<_TpVec32F>(-1.453152027f), + coef3 = v_setall_<_TpVec32F>(1.421413741f), + coef4 = v_setall_<_TpVec32F>(-0.284496736f), + coef5 = v_setall_<_TpVec32F>(0.254829592f), + ones = v_setall_<_TpVec32F>(1.0f), + neg_zeros = v_setall_<_TpVec32F>(-0.f); + _TpVec32F t = v_abs(v); + // sign(v) + _TpVec32F sign_mask = v_and(neg_zeros, v); - inline v_float32 v_erf(const v_float32 &v) { - const v_float32 coef0 = vx_setall_f32(0.3275911f), - coef1 = vx_setall_f32(1.061405429f), - coef2 = vx_setall_f32(-1.453152027f), - coef3 = vx_setall_f32(1.421413741f), - coef4 = vx_setall_f32(-0.284496736f), - coef5 = vx_setall_f32(0.254829592f), - ones = vx_setall_f32(1.0f), - neg_zeros = vx_setall_f32(-0.f); - v_float32 t = v_abs(v); - // sign(v) - v_float32 sign_mask = v_and(neg_zeros, v); - - t = v_div(ones, v_fma(coef0, t, ones)); - v_float32 r = v_fma(coef1, t, coef2); - r = v_fma(r, t, coef3); - r = v_fma(r, t, coef4); - r = v_fma(r, t, coef5); - // - v * v - v_float32 pow_2 = v_mul(v, v); - v_float32 neg_pow_2 = v_xor(neg_zeros, pow_2); - // - exp(- v * v) - v_float32 exp = v_exp(neg_pow_2); - v_float32 neg_exp = v_xor(neg_zeros, exp); - v_float32 res = v_mul(t, neg_exp); - res = v_fma(r, res, ones); - return v_xor(sign_mask, res); - } - -#define OPENCV_HAL_MATH_HAVE_ERF 1 + t = v_div(ones, v_fma(coef0, t, ones)); + _TpVec32F r = v_fma(coef1, t, coef2); + r = v_fma(r, t, coef3); + r = v_fma(r, t, coef4); + r = v_fma(r, t, coef5); + // - v * v + _TpVec32F v2 = v_mul(v, v); + _TpVec32F mv2 = v_xor(neg_zeros, v2); + // - exp(- v * v) + _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2); + _TpVec32F neg_exp = v_xor(neg_zeros, exp); + _TpVec32F res = v_mul(t, neg_exp); + res = v_fma(r, res, ones); + return v_xor(sign_mask, res); +} //! 
@} -#endif // OPENCV_HAL_MATH_HAVE_ERF - - -} -#endif // OPENCV_HAL_INTRIN_HPP +#endif // OPENCV_HAL_INTRIN_MATH_HPP diff --git a/modules/core/include/opencv2/core/hal/intrin_msa.hpp b/modules/core/include/opencv2/core/hal/intrin_msa.hpp index 8d2c22b087..94dc8f55e5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_msa.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_msa.hpp @@ -235,6 +235,8 @@ struct v_float64x2 #define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ +template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ @@ -1861,6 +1863,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
@endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index 6e843d68ea..da8dd4acfa 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -381,6 +381,8 @@ private: #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ +template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \ inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ @@ -2646,6 +2648,19 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +#if defined(CV_SIMD_FP16) && CV_SIMD_FP16 +inline v_float16x8 v_exp(v_float16x8 x) { return v_exp_default_16f(x); } +inline v_float16x8 v_log(v_float16x8 x) { return v_log_default_16f(x); } +#endif +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } +#if CV_SIMD128_64F +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } +#endif + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
@endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp index 4900418df3..332d433a89 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv071.hpp @@ -355,7 +355,9 @@ inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64 #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \ -inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } +inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \ +template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); } OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16) @@ -371,6 +373,11 @@ inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); } +template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); } +template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); } + +template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); } +template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); } #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \ inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ @@ -2859,6 +2866,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 
v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp index 28b0ad8a82..de80e2fccd 100644 --- a/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -160,6 +160,14 @@ inline v_##_Tpvec v_setzero_##suffix1() \ inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ { \ return __riscv_vmv_v_x_##suffix2##m1(v, vl); \ +} \ +template <> inline v_##_Tpvec v_setzero_() \ +{ \ + return v_setzero_##suffix1(); \ +} \ +template <> inline v_##_Tpvec v_setall_(_Tp v) \ +{ \ + return v_setall_##suffix1(v); \ } OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits::vlanes()) @@ -179,6 +187,14 @@ inline v_##_Tpv v_setzero_##suffix() \ inline v_##_Tpv v_setall_##suffix(_Tp v) \ { \ return __riscv_vfmv_v_f_##suffix##m1(v, vl); \ +} \ +template <> inline v_##_Tpv v_setzero_() \ +{ \ + return v_setzero_##suffix(); \ +} \ +template <> inline v_##_Tpv v_setall_(_Tp v) \ +{ \ + return v_setall_##suffix(v); \ } OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits::vlanes()) @@ -2164,6 +2180,14 @@ inline v_float32 v_matmuladd(const v_float32& v, const v_float32& m0, inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32 v_exp(v_float32 x) { return v_exp_default_32f(x); } +inline v_float32 v_log(v_float32 x) { return v_log_default_32f(x); } +inline v_float32 v_erf(v_float32 x) { return v_erf_default_32f(x); } + +inline v_float64 v_exp(v_float64 x) { return v_exp_default_64f(x); } +inline v_float64 v_log(v_float64 x) { return v_log_default_64f(x); } + 
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index f4761c96b4..6f6cbbf9fd 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -347,6 +347,8 @@ namespace hal_sse_internal #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ { return _Tpvec(cast(a.val)); } @@ -364,6 +366,11 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } +template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); } +template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); } +template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); } +template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); } + template inline v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } template inline @@ -3452,6 +3459,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline 
v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index fbe690461a..99684ba8c3 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -261,6 +261,8 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double) #define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \ { return _Tpvec((cast)a.val); } @@ -1594,6 +1596,13 @@ template inline Tvec v_broadcast_element(const Tvec& v) { return Tvec(vec_splat(v.val, i)); } +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END diff --git a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp index 3a8069ca91..f5e5f05beb 100644 --- a/modules/core/include/opencv2/core/hal/intrin_wasm.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_wasm.hpp @@ -392,6 +392,8 @@ inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a) #define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \ inline _Tpvec v_setzero_##suffix() { return 
_Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ { return _Tpvec(a.val); } @@ -2767,6 +2769,14 @@ inline void v_pack_store(hfloat* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(v_float32x4 x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(v_float32x4 x) { return v_log_default_32f(x); } +inline v_float32x4 v_erf(v_float32x4 x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(v_float64x2 x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(v_float64x2 x) { return v_log_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/modules/core/test/test_intrin_utils.hpp b/modules/core/test/test_intrin_utils.hpp index ad8faf7bfb..9eed2d2da3 100644 --- a/modules/core/test/test_intrin_utils.hpp +++ b/modules/core/test/test_intrin_utils.hpp @@ -300,14 +300,20 @@ template struct TheTest #else #error "Configuration error" #endif + R setall_res3 = v_setall_((LaneType)7); + R setall_resz = v_setzero_(); #if CV_SIMD_WIDTH > 0 Data setall_res1_; v_store(setall_res1_.d, setall_res1); Data setall_res2_; v_store(setall_res2_.d, setall_res2); + Data setall_res3_; v_store(setall_res3_.d, setall_res3); + Data setall_resz_; v_store(setall_resz_.d, setall_resz); for (int i = 0; i < VTraits::vlanes(); ++i) { SCOPED_TRACE(cv::format("i=%d", i)); EXPECT_EQ((LaneType)5, setall_res1_[i]); EXPECT_EQ((LaneType)6, setall_res2_[i]); + EXPECT_EQ((LaneType)7, setall_res3_[i]); + EXPECT_EQ((LaneType)0, setall_resz_[i]); } #endif