mirror of
https://github.com/opencv/opencv.git
synced 2024-12-11 22:59:16 +08:00
Merge pull request #24941 from WanliZhong:v_exp
Add support for v_exp (exponential) #24941 This PR aims to implement `v_exp(v_float16 x)`, `v_exp(v_float32 x)` and `v_exp(v_float64 x)`. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [ ] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
parent
75339a5528
commit
6e1864e3fc
@ -1239,6 +1239,7 @@ namespace CV__SIMD_NAMESPACE {
|
||||
#define CV_SIMD 0
|
||||
#endif
|
||||
|
||||
#include "intrin_math.hpp"
|
||||
#include "simd_utils.impl.hpp"
|
||||
|
||||
#ifndef CV_DOXYGEN
|
||||
|
@ -263,7 +263,7 @@ Most of these operations return only one value.
|
||||
|
||||
### Other math
|
||||
|
||||
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
|
||||
- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp
|
||||
- Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
|
||||
|
||||
### Conversions
|
||||
@ -363,6 +363,7 @@ Floating point:
|
||||
|reverse | x | x |
|
||||
|extract_n | x | x |
|
||||
|broadcast_element | x | |
|
||||
|exp | x | x |
|
||||
|
||||
@{ */
|
||||
|
||||
@ -721,11 +722,33 @@ template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a)
|
||||
Only for floating point types.*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
|
||||
|
||||
/**
|
||||
* @brief Exponential \f$ e^x \f$ of elements
|
||||
*
|
||||
* Only for floating point types. Core implementation steps:
|
||||
* 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts:
|
||||
* \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part.
|
||||
* 2. Compute \f$ 2^n \f$: Calculated by shifting the bits.
|
||||
* 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$.
|
||||
* \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part.
|
||||
* 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result.
|
||||
* - For float16 and float32, use a Taylor Series with 6 terms.
|
||||
* - For float64, use Pade Polynomials Approximation with 4 terms.
|
||||
* 5. Combine Results: Multiply the two parts together to get the final result:
|
||||
* \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$.
|
||||
*
|
||||
* @note The precision of the calculation depends on the implementation and the data type of the input vector.
|
||||
*/
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
|
||||
#define OPENCV_HAL_MATH_HAVE_EXP 1
|
||||
|
||||
//! @cond IGNORED
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
|
||||
#define OPENCV_HAL_MATH_HAVE_SIN 1
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
|
||||
#define OPENCV_HAL_MATH_HAVE_COS 1
|
||||
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
|
||||
#define OPENCV_HAL_MATH_HAVE_LOG 1
|
||||
//! @endcond
|
||||
|
||||
/** @brief Absolute value of elements
|
||||
|
200
modules/core/include/opencv2/core/hal/intrin_math.hpp
Normal file
200
modules/core/include/opencv2/core/hal/intrin_math.hpp
Normal file
@ -0,0 +1,200 @@
|
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
|
||||
#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp
|
||||
|
||||
namespace CV__SIMD_NAMESPACE {
|
||||
|
||||
/* Universal Intrinsics implementation of sin, cos, exp and log
|
||||
|
||||
Inspired by Intel Approximate Math library, and based on the
|
||||
corresponding algorithms of the cephes math library
|
||||
*/
|
||||
|
||||
/* Copyright (C) 2010,2011 RJVB - extensions */
|
||||
/* Copyright (C) 2011 Julien Pommier
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
(this is the zlib license)
|
||||
*/
|
||||
|
||||
#ifndef OPENCV_HAL_MATH_HAVE_EXP
|
||||
|
||||
//! @name Exponential
|
||||
//! @{
|
||||
#if defined(CV_SIMD_FP16) && CV_SIMD_FP16
|
||||
// Implementation is the same as float32 vector.
|
||||
inline v_float16 v_exp(const v_float16 &x) {
|
||||
const v_float16 _vexp_lo_f16 = vx_setall_f16(-10.7421875f);
|
||||
const v_float16 _vexp_hi_f16 = vx_setall_f16(11.f);
|
||||
const v_float16 _vexp_half_fp16 = vx_setall_f16(0.5f);
|
||||
const v_float16 _vexp_one_fp16 = vx_setall_f16(1.f);
|
||||
const v_float16 _vexp_LOG2EF_f16 = vx_setall_f16(1.44269504088896341f);
|
||||
const v_float16 _vexp_C1_f16 = vx_setall_f16(-6.93359375E-1f);
|
||||
const v_float16 _vexp_C2_f16 = vx_setall_f16(2.12194440E-4f);
|
||||
const v_float16 _vexp_p0_f16 = vx_setall_f16(1.9875691500E-4f);
|
||||
const v_float16 _vexp_p1_f16 = vx_setall_f16(1.3981999507E-3f);
|
||||
const v_float16 _vexp_p2_f16 = vx_setall_f16(8.3334519073E-3f);
|
||||
const v_float16 _vexp_p3_f16 = vx_setall_f16(4.1665795894E-2f);
|
||||
const v_float16 _vexp_p4_f16 = vx_setall_f16(1.6666665459E-1f);
|
||||
const v_float16 _vexp_p5_f16 = vx_setall_f16(5.0000001201E-1f);
|
||||
const v_int16 _vexp_bias_s16 = vx_setall_s16(0xf);
|
||||
|
||||
v_float16 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
|
||||
v_int16 _vexp_mm;
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f16);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f16);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16);
|
||||
_vexp_mm = v_floor(_vexp_);
|
||||
_vexp_ = v_cvt_f16(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s16);
|
||||
_vexp_mm = v_shl(_vexp_mm, 10);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16);
|
||||
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
|
||||
_vexp_y = v_add(_vexp_y, _vexp_one_fp16);
|
||||
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
v_float16 mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(vx_setall_s16(0x7e00)));
|
||||
}
|
||||
#endif
|
||||
|
||||
inline v_float32 v_exp(const v_float32 &x) {
|
||||
const v_float32 _vexp_lo_f32 = vx_setall_f32(-88.3762626647949f);
|
||||
const v_float32 _vexp_hi_f32 = vx_setall_f32(89.f);
|
||||
const v_float32 _vexp_half_fp32 = vx_setall_f32(0.5f);
|
||||
const v_float32 _vexp_one_fp32 = vx_setall_f32(1.f);
|
||||
const v_float32 _vexp_LOG2EF_f32 = vx_setall_f32(1.44269504088896341f);
|
||||
const v_float32 _vexp_C1_f32 = vx_setall_f32(-6.93359375E-1f);
|
||||
const v_float32 _vexp_C2_f32 = vx_setall_f32(2.12194440E-4f);
|
||||
const v_float32 _vexp_p0_f32 = vx_setall_f32(1.9875691500E-4f);
|
||||
const v_float32 _vexp_p1_f32 = vx_setall_f32(1.3981999507E-3f);
|
||||
const v_float32 _vexp_p2_f32 = vx_setall_f32(8.3334519073E-3f);
|
||||
const v_float32 _vexp_p3_f32 = vx_setall_f32(4.1665795894E-2f);
|
||||
const v_float32 _vexp_p4_f32 = vx_setall_f32(1.6666665459E-1f);
|
||||
const v_float32 _vexp_p5_f32 = vx_setall_f32(5.0000001201E-1f);
|
||||
const v_int32 _vexp_bias_s32 = vx_setall_s32(0x7f);
|
||||
|
||||
v_float32 _vexp_, _vexp_x, _vexp_y, _vexp_xx;
|
||||
v_int32 _vexp_mm;
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f32);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f32);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32);
|
||||
_vexp_mm = v_floor(_vexp_);
|
||||
_vexp_ = v_cvt_f32(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s32);
|
||||
_vexp_mm = v_shl(_vexp_mm, 23);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32);
|
||||
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x);
|
||||
_vexp_y = v_add(_vexp_y, _vexp_one_fp32);
|
||||
_vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
v_float32 mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(vx_setall_s32(0x7fc00000)));
|
||||
}
|
||||
|
||||
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
|
||||
inline v_float64 v_exp(const v_float64 &x) {
|
||||
const v_float64 _vexp_lo_f64 = vx_setall_f64(-709.43613930310391424428);
|
||||
const v_float64 _vexp_hi_f64 = vx_setall_f64(710.);
|
||||
const v_float64 _vexp_half_f64 = vx_setall_f64(0.5);
|
||||
const v_float64 _vexp_one_f64 = vx_setall_f64(1.0);
|
||||
const v_float64 _vexp_two_f64 = vx_setall_f64(2.0);
|
||||
const v_float64 _vexp_LOG2EF_f64 = vx_setall_f64(1.44269504088896340736);
|
||||
const v_float64 _vexp_C1_f64 = vx_setall_f64(-6.93145751953125E-1);
|
||||
const v_float64 _vexp_C2_f64 = vx_setall_f64(-1.42860682030941723212E-6);
|
||||
const v_float64 _vexp_p0_f64 = vx_setall_f64(1.26177193074810590878E-4);
|
||||
const v_float64 _vexp_p1_f64 = vx_setall_f64(3.02994407707441961300E-2);
|
||||
const v_float64 _vexp_p2_f64 = vx_setall_f64(9.99999999999999999910E-1);
|
||||
const v_float64 _vexp_q0_f64 = vx_setall_f64(3.00198505138664455042E-6);
|
||||
const v_float64 _vexp_q1_f64 = vx_setall_f64(2.52448340349684104192E-3);
|
||||
const v_float64 _vexp_q2_f64 = vx_setall_f64(2.27265548208155028766E-1);
|
||||
const v_float64 _vexp_q3_f64 = vx_setall_f64(2.00000000000000000009E0);
|
||||
const v_int64 _vexp_bias_s64 = vx_setall_s64(0x3ff);
|
||||
|
||||
v_float64 _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx;
|
||||
v_int64 _vexp_mm;
|
||||
|
||||
// compute exponential of x
|
||||
_vexp_x = v_max(x, _vexp_lo_f64);
|
||||
_vexp_x = v_min(_vexp_x, _vexp_hi_f64);
|
||||
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64);
|
||||
_vexp_mm = v_expand_low(v_floor(_vexp_));
|
||||
_vexp_ = v_cvt_f64(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, _vexp_bias_s64);
|
||||
_vexp_mm = v_shl(_vexp_mm, 52);
|
||||
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x);
|
||||
_vexp_xx = v_mul(_vexp_x, _vexp_x);
|
||||
|
||||
_vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64);
|
||||
_vexp_y = v_mul(_vexp_y, _vexp_x);
|
||||
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64);
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64);
|
||||
_vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64);
|
||||
|
||||
_vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y));
|
||||
_vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64);
|
||||
_vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm));
|
||||
|
||||
// exp(NAN) -> NAN
|
||||
v_float64 mask_not_nan = v_not_nan(x);
|
||||
return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(vx_setall_s64(0x7FF8000000000000)));
|
||||
}
|
||||
#endif
|
||||
|
||||
#define OPENCV_HAL_MATH_HAVE_EXP 1
|
||||
//! @}
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif // OPENCV_HAL_INTRIN_HPP
|
@ -1698,6 +1698,103 @@ template<typename R> struct TheTest
|
||||
return *this;
|
||||
}
|
||||
|
||||
void __test_exp(LaneType dataMax, LaneType diff_thr, LaneType enlarge_factor, LaneType flt_min) {
|
||||
int n = VTraits<R>::vlanes();
|
||||
|
||||
// Test overflow and underflow values with step
|
||||
const LaneType step = (LaneType) 0.01;
|
||||
for (LaneType i = dataMax + 1; i <= dataMax + 11;) {
|
||||
Data<R> dataUpperBound, dataLowerBound, resOverflow, resUnderflow;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
dataUpperBound[j] = i;
|
||||
dataLowerBound[j] = -i;
|
||||
i += step;
|
||||
}
|
||||
R upperBound = dataUpperBound, lowerBound = dataLowerBound;
|
||||
resOverflow = v_exp(upperBound);
|
||||
resUnderflow = v_exp(lowerBound);
|
||||
for (int j = 0; j < n; ++j) {
|
||||
SCOPED_TRACE(cv::format("Overflow/Underflow test value: %f", i));
|
||||
EXPECT_TRUE(resOverflow[j] > 0 && std::isinf(resOverflow[j]));
|
||||
EXPECT_GE(resUnderflow[j], 0);
|
||||
EXPECT_LT(resUnderflow[j], flt_min);
|
||||
}
|
||||
}
|
||||
|
||||
// Test random values combined with special values
|
||||
std::vector<LaneType> specialValues = {0, 1, INFINITY, -INFINITY, NAN, dataMax};
|
||||
const int testRandNum = 10000;
|
||||
const double specialValueProbability = 0.1; // 10% chance to insert a special value
|
||||
cv::RNG_MT19937 rng;
|
||||
|
||||
for (int i = 0; i < testRandNum; i++) {
|
||||
Data<R> dataRand, resRand;
|
||||
for (int j = 0; j < n; ++j) {
|
||||
if (rng.uniform(0.f, 1.f) <= specialValueProbability) {
|
||||
// Insert a special value
|
||||
int specialValueIndex = rng.uniform(0, (int) specialValues.size());
|
||||
dataRand[j] = specialValues[specialValueIndex];
|
||||
} else {
|
||||
// Generate random data in [-dataMax*1.1, dataMax*1.1]
|
||||
dataRand[j] = (LaneType) rng.uniform(-dataMax * 1.1, dataMax * 1.1);
|
||||
}
|
||||
}
|
||||
// Compare with std::exp
|
||||
R x = dataRand;
|
||||
resRand = v_exp(x);
|
||||
for (int j = 0; j < n; ++j) {
|
||||
SCOPED_TRACE(cv::format("Random test value: %f", dataRand[j]));
|
||||
LaneType std_exp = std::exp(dataRand[j]);
|
||||
if (dataRand[j] == 0) {
|
||||
// input 0 -> output 1
|
||||
EXPECT_EQ(resRand[j], 1);
|
||||
} else if (dataRand[j] == 1) {
|
||||
// input 1 -> output e
|
||||
EXPECT_NEAR((LaneType) M_E, resRand[j], 1e-15);
|
||||
} else if (dataRand[j] > 0 && std::isinf(dataRand[j])) {
|
||||
// input INF -> output INF
|
||||
EXPECT_TRUE(resRand[j] > 0 && std::isinf(resRand[j]));
|
||||
} else if (dataRand[j] < 0 && std::isinf(dataRand[j])) {
|
||||
// input -INF -> output 0
|
||||
EXPECT_EQ(resRand[j], 0);
|
||||
} else if (std::isnan(dataRand[j])) {
|
||||
// input NaN -> output NaN
|
||||
EXPECT_TRUE(std::isnan(resRand[j]));
|
||||
} else if (dataRand[j] == dataMax) {
|
||||
// input dataMax -> output less than INFINITY
|
||||
EXPECT_LT(resRand[j], (LaneType) INFINITY);
|
||||
} else if (std::isinf(resRand[j])) {
|
||||
// output INF -> input close to edge
|
||||
EXPECT_GT(dataRand[j], dataMax);
|
||||
} else {
|
||||
EXPECT_GE(resRand[j], 0);
|
||||
EXPECT_LT(std::abs(resRand[j] - std_exp), diff_thr * (std_exp + flt_min * enlarge_factor));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TheTest &test_exp_fp16() {
|
||||
#if CV_SIMD_FP16
|
||||
float16_t flt16_min;
|
||||
uint16_t flt16_min_hex = 0x0400;
|
||||
std::memcpy(&flt16_min, &flt16_min_hex, sizeof(float16_t));
|
||||
__test_exp((float16_t) 10, (float16_t) 1e-2, (float16_t) 1e2, flt16_min);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest &test_exp_fp32() {
|
||||
__test_exp(88.0f, 1e-6f, 1e6f, FLT_MIN);
|
||||
return *this;
|
||||
}
|
||||
|
||||
TheTest &test_exp_fp64() {
|
||||
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
|
||||
__test_exp(709.0, 1e-15, 1e15, DBL_MIN);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
#define DUMP_ENTRY(type) printf("SIMD%d: %s\n", 8*VTraits<v_uint8>::vlanes(), CV__TRACE_FUNCTION);
|
||||
@ -2011,6 +2108,7 @@ void test_hal_intrin_float32()
|
||||
.test_extract_highest()
|
||||
.test_broadcast_highest()
|
||||
.test_pack_triplets()
|
||||
.test_exp_fp32()
|
||||
#if CV_SIMD_WIDTH == 32
|
||||
.test_extract<4>().test_extract<5>().test_extract<6>().test_extract<7>()
|
||||
.test_rotate<4>().test_rotate<5>().test_rotate<6>().test_rotate<7>()
|
||||
@ -2035,13 +2133,13 @@ void test_hal_intrin_float64()
|
||||
.test_mask()
|
||||
.test_unpack()
|
||||
.test_float_math()
|
||||
.test_round_pair_f64()
|
||||
.test_float_cvt32()
|
||||
.test_reverse()
|
||||
.test_extract<0>().test_extract<1>()
|
||||
.test_rotate<0>().test_rotate<1>()
|
||||
.test_extract_n<0>().test_extract_n<1>()
|
||||
.test_extract_highest()
|
||||
.test_exp_fp64()
|
||||
//.test_broadcast_element<0>().test_broadcast_element<1>()
|
||||
#if CV_SIMD_WIDTH == 32
|
||||
.test_extract<2>().test_extract<3>()
|
||||
@ -2062,6 +2160,7 @@ void test_hal_intrin_float16()
|
||||
#if CV_SIMD_FP16
|
||||
.test_loadstore_fp16()
|
||||
.test_float_cvt_fp16()
|
||||
.test_exp_fp16()
|
||||
#endif
|
||||
;
|
||||
#else
|
||||
|
@ -71,48 +71,12 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
|
||||
// calculate the exp value along the axis
|
||||
v_float32 vs = vx_setzero_f32();
|
||||
vmax = vx_setall_f32(maxVal);
|
||||
// initialize vexp constant
|
||||
v_float32 _vexp_lo = vx_setall_f32(-88.3762626647949f);
|
||||
v_float32 _vexp_hi = vx_setall_f32(88.3762626647949f);
|
||||
v_float32 _vexp_half = vx_setall_f32(0.5f);
|
||||
v_float32 _vexp_one = vx_setall_f32(1.f);
|
||||
v_float32 _vexp_LOG2EF = vx_setall_f32(1.44269504088896341f);
|
||||
v_float32 _vexp_C1 = vx_setall_f32(-0.693359375f);
|
||||
v_float32 _vexp_C2 = vx_setall_f32(2.12194440e-4f);
|
||||
v_float32 _vexp_p0 = vx_setall_f32(1.9875691500E-4f);
|
||||
v_float32 _vexp_p1 = vx_setall_f32(1.3981999507E-3f);
|
||||
v_float32 _vexp_p2 = vx_setall_f32(8.3334519073E-3f);
|
||||
v_float32 _vexp_p3 = vx_setall_f32(4.1665795894E-2f);
|
||||
v_float32 _vexp_p4 = vx_setall_f32(1.6666665459E-1f);
|
||||
v_float32 _vexp_p5 = vx_setall_f32(5.0000001201E-1f);
|
||||
// initialize temp vectors for vexp
|
||||
v_float32 val, _vexp_, _vexp_x, _vexp_y, _vexp_z;
|
||||
v_int32 _vexp_mm;
|
||||
|
||||
v_float32 val;
|
||||
// calculate and sum all data along axis
|
||||
for (size_t cnDim = 0; cnDim < axisStep; cnDim += nlanes) {
|
||||
val = vx_load(axisBuf + cnDim);
|
||||
val = v_sub(val, vmax);
|
||||
|
||||
// compute vexp of val
|
||||
_vexp_x = v_min(val, _vexp_hi);
|
||||
_vexp_x = v_max(_vexp_x, _vexp_lo);
|
||||
_vexp_ = v_fma(_vexp_x, _vexp_LOG2EF, _vexp_half);
|
||||
_vexp_mm = v_floor(_vexp_);
|
||||
_vexp_ = v_cvt_f32(_vexp_mm);
|
||||
_vexp_mm = v_add(_vexp_mm, vx_setall_s32(0x7f));
|
||||
_vexp_mm = v_shl(_vexp_mm, 23);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C1, _vexp_x);
|
||||
_vexp_x = v_fma(_vexp_, _vexp_C2, _vexp_x);
|
||||
_vexp_z = v_mul(_vexp_x, _vexp_x);
|
||||
_vexp_y = v_fma(_vexp_x, _vexp_p0, _vexp_p1);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5);
|
||||
_vexp_y = v_fma(_vexp_y, _vexp_z, _vexp_x);
|
||||
_vexp_y = v_add(_vexp_y, _vexp_one);
|
||||
val = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm));
|
||||
val = v_exp(val);
|
||||
|
||||
vs = v_add(vs, val);
|
||||
v_store(axisBuf + cnDim, val);
|
||||
|
Loading…
Reference in New Issue
Block a user