diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp
index 9c03202a83..dbd60e70ea 100644
--- a/3rdparty/carotene/src/div.cpp
+++ b/3rdparty/carotene/src/div.cpp
@@ -51,6 +51,18 @@ namespace {
 
 #ifdef CAROTENE_NEON
 
+// Biases each lane by 0.5 carrying the lane's own sign, so that the truncating
+// float -> integer conversion applied by the callers rounds to nearest
+// (halfway cases away from zero) instead of discarding the fraction.
+inline float32x4_t vroundq(const float32x4_t& v)
+{
+    // Bit-select the sign bit from v and every other bit from 0.5f, i.e.
+    // copysign(0.5f, v). An unsigned mask avoids shifting into an int sign bit.
+    const uint32x4_t signMask = vdupq_n_u32(0x80000000u);
+    const float32x4_t signedHalf = vbslq_f32(signMask, v, vdupq_n_f32(0.5f));
+    return vaddq_f32(v, signedHalf);
+}
+
 template <typename T>
 inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 {
@@ -62,10 +74,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 }
 template <>
 inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
-{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
+{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
 template <>
 inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
-{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
+{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
 
 template <typename T>
 inline T divSaturate(const T &v1, const T &v2, const float scale)