mirror of
https://github.com/opencv/opencv.git
synced 2025-01-18 14:13:15 +08:00
let the performance test pass on ARM
* use round-to-neaerest in div of carotene
This commit is contained in:
parent
8213e57f3b
commit
971a7c88dc
11
3rdparty/carotene/src/div.cpp
vendored
11
3rdparty/carotene/src/div.cpp
vendored
@ -51,6 +51,13 @@ namespace {
|
||||
|
||||
#ifdef CAROTENE_NEON
|
||||
|
||||
inline float32x4_t vroundq(const float32x4_t& v)
|
||||
{
|
||||
const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
||||
float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
|
||||
return vaddq_f32(v, v_addition);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
||||
{
|
||||
@ -62,10 +69,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
||||
}
|
||||
template <>
|
||||
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
|
||||
{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
|
||||
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
|
||||
template <>
|
||||
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
|
||||
{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
|
||||
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
|
||||
|
||||
template <typename T>
|
||||
inline T divSaturate(const T &v1, const T &v2, const float scale)
|
||||
|
Loading…
Reference in New Issue
Block a user