Merge pull request #26369 from WanliZhong:5x_fix_hfloat_vfunc

Fix hfloat conflicts of v_func in merging 4.x to 5.x #26369

This PR resolves the conflicts that arise when merging 4.x into 5.x (see https://github.com/opencv/opencv/pull/26358):
1. Explicitly convert the numeric arguments passed to `v_setall_` to `hfloat`.
2. Loosen the threshold for the `v_sincos` test (related issue: https://github.com/opencv/opencv/issues/26362).
3. Remove the new but temporary API `template <> inline v_float16x8 v_setall_(float v) { return v_setall_f16((hfloat)v); }`.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
This commit is contained in:
Wanli 2024-10-27 00:54:13 +08:00 committed by GitHub
parent 05e7988e9c
commit 29e712ed93
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 29 additions and 33 deletions

View File

@ -38,19 +38,19 @@
// Implementation is the same as float32 vector.
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) {
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f);
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f);
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f);
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f);
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f);
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f);
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f);
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f);
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f);
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f);
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f);
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f);
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f);
const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(hfloat(-10.7421875f));
const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(hfloat(11.f));
const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(hfloat(0.5f));
const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(hfloat(1.f));
const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(hfloat(1.44269504088896341f));
const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(hfloat(-6.93359375E-1f));
const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(hfloat(2.12194440E-4f));
const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(hfloat(1.9875691500E-4f));
const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(hfloat(1.3981999507E-3f));
const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(hfloat(8.3334519073E-3f));
const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(hfloat(4.1665795894E-2f));
const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(hfloat(1.6666665459E-1f));
const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(hfloat(5.0000001201E-1f));
_TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx;
_TpVec16S _vexp_mm;
@ -192,19 +192,19 @@ inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) {
//! @{
template<typename _TpVec16F, typename _TpVec16S>
inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f);
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f);
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f);
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f);
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f);
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f);
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f);
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f);
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f);
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f);
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f);
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f);
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f);
const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(hfloat(1.0f));
const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(hfloat(0.707106781186547524f));
const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(hfloat(-2.12194440E-4f));
const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(hfloat(0.693359375f));
const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(hfloat(7.0376836292E-2f));
const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(hfloat(-1.1514610310E-1f));
const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(hfloat(1.1676998740E-1f));
const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(hfloat(-1.2420140846E-1f));
const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(hfloat(1.4249322787E-1f));
const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(hfloat(-1.6668057665E-1f));
const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(hfloat(2.0000714765E-1f));
const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(hfloat(-2.4999993993E-1f));
const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(hfloat(3.3333331174E-1f));
_TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp;
_TpVec16S _vlog_ux, _vlog_emm0;
@ -214,7 +214,7 @@ inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
_vlog_emm0 = v_shr(_vlog_ux, 10);
_vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16);
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f)));
_vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(hfloat(0.5f))));
_vlog_x = v_reinterpret_as_f16(_vlog_ux);
_vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf));
@ -243,7 +243,7 @@ inline _TpVec16F v_log_default_16f(const _TpVec16F &x) {
_vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y);
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f)));
_vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(hfloat(0.5f))));
_vlog_x = v_add(_vlog_x, _vlog_y);
_vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x);

View File

@ -437,7 +437,6 @@ OPENCV_HAL_IMPL_NEON_INIT(uint64x2, uint64, uint64, u64)
OPENCV_HAL_IMPL_NEON_INIT(int64x2, int64, int64, s64)
#if CV_SIMD128_FP16
OPENCV_HAL_IMPL_NEON_INIT(float16x8, hfloat, __fp16, f16);
template <> inline v_float16x8 v_setall_(float v) { return v_setall_f16((hfloat)v); }
#define OPENCV_HAL_IMPL_NEON_INIT_FP16(_Tpv, suffix) \
inline v_float16x8 v_reinterpret_as_f16(const v_##_Tpv& v) { return v_float16x8(vreinterpretq_f16_##suffix(v.val)); }
OPENCV_HAL_IMPL_NEON_INIT_FP16(uint8x16, u8)

View File

@ -2162,14 +2162,11 @@ template<typename R> struct TheTest
}
}
// BUG: https://github.com/opencv/opencv/issues/26362
TheTest &test_sincos_fp16() {
#if 0 // CV_SIMD_FP16
hfloat flt16_min;
uint16_t flt16_min_hex = 0x0400;
std::memcpy(&flt16_min, &flt16_min_hex, sizeof(hfloat));
__test_sincos((hfloat) 1e-3, flt16_min);
#endif
__test_sincos((hfloat) 4e-3, flt16_min);
return *this;
}