diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt index ebcdf1a9f6..aa95956e7f 100644 --- a/3rdparty/carotene/CMakeLists.txt +++ b/3rdparty/carotene/CMakeLists.txt @@ -42,6 +42,14 @@ endif() if(WITH_NEON) target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON") + if(NOT DEFINED CAROTENE_NEON_ARCH ) + elseif(CAROTENE_NEON_ARCH EQUAL 8) + target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8") + elseif(CAROTENE_NEON_ARCH EQUAL 7) + target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7") + else() + target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0") + endif() endif() # we add dummy file to fix XCode build diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp index 6559b9fe53..7e2945e88c 100644 --- a/3rdparty/carotene/src/add_weighted.cpp +++ b/3rdparty/carotene/src/add_weighted.cpp @@ -39,6 +39,7 @@ #include "common.hpp" #include "vtransform.hpp" +#include "vround_helper.hpp" namespace CAROTENE_NS { @@ -106,7 +107,7 @@ template <> struct wAdd { valpha = vdupq_n_f32(_alpha); vbeta = vdupq_n_f32(_beta); - vgamma = vdupq_n_f32(_gamma + 0.5); + vgamma = vdupq_n_f32(_gamma); } void operator() (const VecTraits::vec128 & v_src0, @@ -118,7 +119,7 @@ template <> struct wAdd vs1 = vmlaq_f32(vgamma, vs1, valpha); vs1 = vmlaq_f32(vs1, vs2, vbeta); - v_dst = vcvtq_s32_f32(vs1); + v_dst = vroundq_s32_f32(vs1); } void operator() (const VecTraits::vec64 & v_src0, @@ -130,7 +131,7 @@ template <> struct wAdd vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); - v_dst = vcvt_s32_f32(vs1); + v_dst = vround_s32_f32(vs1); } void operator() (const s32 * src0, const s32 * src1, s32 * dst) const @@ -150,7 +151,7 @@ template <> struct wAdd { valpha = vdupq_n_f32(_alpha); vbeta = vdupq_n_f32(_beta); - vgamma = vdupq_n_f32(_gamma + 0.5); + vgamma = vdupq_n_f32(_gamma); } void operator() (const VecTraits::vec128 & v_src0, @@ -162,7 +163,7 @@ template <> struct wAdd vs1 = vmlaq_f32(vgamma, vs1, valpha); vs1 = vmlaq_f32(vs1, vs2, vbeta); - v_dst = vcvtq_u32_f32(vs1); + v_dst = vroundq_u32_f32(vs1); } void operator() (const VecTraits::vec64 & v_src0, @@ -174,7 +175,7 @@ template <> struct wAdd vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); - v_dst = vcvt_u32_f32(vs1); + v_dst = vround_u32_f32(vs1); } void operator() (const u32 * src0, const u32 * src1, u32 * dst) const diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp index 21689a2bd3..30c1f8a729 100644 --- a/3rdparty/carotene/src/blur.cpp +++ b/3rdparty/carotene/src/blur.cpp @@ -41,6 +41,7 @@ #include "common.hpp" #include "saturate_cast.hpp" +#include "vround_helper.hpp" namespace CAROTENE_NS { @@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn, //#define FLOAT_VARIANT_1_9 #ifdef FLOAT_VARIANT_1_9 float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0); - float32x4_t v0_5 = vdupq_n_f32 (.5); #else const int16x8_t vScale = vmovq_n_s16(3640); #endif @@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn, uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); - tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); - tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + tres1 = internal::vroundq_u32_f32(vf1); + tres2 = internal::vroundq_u32_f32(vf2); t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); vst1_u8(drow + x - 8, 
vmovn_u16(t0)); #else @@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn, uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); - tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); - tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + tres1 = internal::vroundq_u32_f32(vf1); + tres2 = internal::vroundq_u32_f32(vf2); t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); vst1_u8(drow + x - 8, vmovn_u16(t0)); #else @@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn, #define FLOAT_VARIANT_1_25 #ifdef FLOAT_VARIANT_1_25 float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f); - float32x4_t v0_5 = vdupq_n_f32 (.5f); #else const int16x8_t vScale = vmovq_n_s16(1310); #endif @@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn, uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1)); float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2)); - tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); - tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + tres1 = internal::vroundq_u32_f32(vf1); + tres2 = internal::vroundq_u32_f32(vf2); t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); vst1_u8(drow + x - 8, vmovn_u16(t0)); #else diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp index a8aef9b722..752c65146a 100644 --- a/3rdparty/carotene/src/colorconvert.cpp +++ b/3rdparty/carotene/src/colorconvert.cpp @@ -40,6 +40,7 @@ #include "common.hpp" #include "saturate_cast.hpp" +#include "vround_helper.hpp" namespace CAROTENE_NS { @@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui vSt3 = vmulq_f32(vHF1, vDivTab); vSt4 = vmulq_f32(vHF2, vDivTab); - float32x4_t bias = vdupq_n_f32(0.5f); - - vSt1 = vaddq_f32(vSt1, bias); - vSt2 = vaddq_f32(vSt2, bias); - vSt3 = vaddq_f32(vSt3, bias); - vSt4 = vaddq_f32(vSt4, bias); - - uint32x4_t vRes1 = vcvtq_u32_f32(vSt1); - uint32x4_t vRes2 = vcvtq_u32_f32(vSt2); - uint32x4_t vRes3 = vcvtq_u32_f32(vSt3); - uint32x4_t vRes4 = vcvtq_u32_f32(vSt4); + uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1); + uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2); + uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3); + uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4); int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4)); int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4)); diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp index 823ddf1ccf..b9de371a6a 100644 --- a/3rdparty/carotene/src/common.hpp +++ b/3rdparty/carotene/src/common.hpp @@ -58,6 +58,17 @@ namespace CAROTENE_NS { namespace internal { +#ifndef CAROTENE_NEON_ARCH +# if defined(__aarch64__) || defined(__aarch32__) +# define CAROTENE_NEON_ARCH 8 +# else +# define CAROTENE_NEON_ARCH 7 +# endif +#endif +#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 ) +# error("ARMv7 doesn't support A32/A64 Neon instructions") +#endif + inline void prefetch(const void *ptr, size_t offset = 32*10) { #if defined __GNUC__ diff --git a/3rdparty/carotene/src/convert_scale.cpp b/3rdparty/carotene/src/convert_scale.cpp index d599d24c1e..f88dbea182 100644 --- a/3rdparty/carotene/src/convert_scale.cpp +++ b/3rdparty/carotene/src/convert_scale.cpp @@ -38,6 +38,7 @@ */ #include "common.hpp" +#include "vround_helper.hpp" namespace CAROTENE_NS { @@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16, #else CVTS_FUNC1(u8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); -
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); - int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); - int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32); uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); @@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16, #else CVTS_FUNC(u8, s8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); - int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); - int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32); int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16))); @@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16, #else CVTS_FUNC(u8, u16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); - int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); - int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32); vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32))); vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32))); } @@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16, #else CVTS_FUNC(u8, s16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -463,10 +464,10 @@ CVTS_FUNC(u8, 
s16, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); - int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); - int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32); vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32))); vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32))); } @@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16, #else CVTS_FUNC(u8, s32, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); - int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); - int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); vst1q_s32(_dst + i + 8, vline3_s32); @@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16, #else CVTS_FUNC(s8, u8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); - vline3_s32 = vcvtq_s32_f32(vline3_f32); - vline4_s32 = vcvtq_s32_f32(vline4_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + vline4_s32 = internal::vroundq_s32_f32(vline4_f32); uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); @@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16, #else CVTS_FUNC1(s8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); - vline3_s32 = vcvtq_s32_f32(vline3_f32); - vline4_s32 = vcvtq_s32_f32(vline4_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = 
internal::vroundq_s32_f32(vline2_f32); + vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + vline4_s32 = internal::vroundq_s32_f32(vline4_f32); int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16))); @@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16, #else CVTS_FUNC(s8, u16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); - vline3_s32 = vcvtq_s32_f32(vline3_f32); - vline4_s32 = vcvtq_s32_f32(vline4_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + vline4_s32 = internal::vroundq_s32_f32(vline4_f32); uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); vst1q_u16(_dst + i + 0, vRes1_u16); @@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16, #else CVTS_FUNC(s8, s16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); - vline3_s32 = vcvtq_s32_f32(vline3_f32); - vline4_s32 = vcvtq_s32_f32(vline4_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + vline4_s32 = internal::vroundq_s32_f32(vline4_f32); int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); vst1q_s16(_dst + i + 0, vRes1_s16); @@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16, #else CVTS_FUNC(s8, s32, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 16) { @@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16, vline2_f32 = vaddq_f32(vline2_f32, vshift); vline3_f32 = vaddq_f32(vline3_f32, vshift); vline4_f32 = vaddq_f32(vline4_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); - vline3_s32 = vcvtq_s32_f32(vline3_f32); - vline4_s32 = vcvtq_s32_f32(vline4_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); + vline3_s32 = internal::vroundq_s32_f32(vline3_f32); + vline4_s32 = internal::vroundq_s32_f32(vline4_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); vst1q_s32(_dst + i + 8, vline3_s32); @@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16, #else CVTS_FUNC(u16, u8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 
0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); @@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16, #else CVTS_FUNC(u16, s8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); @@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16, #else CVTS_FUNC1(u16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); uint16x4_t vRes1 = vqmovun_s32(vline1_s32); uint16x4_t vRes2 = vqmovun_s32(vline2_s32); vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); @@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8, #else CVTS_FUNC(u16, s16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); @@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8, #else CVTS_FUNC(u16, s32, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - 
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); } @@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16, #else CVTS_FUNC(s16, u8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); @@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16, #else CVTS_FUNC(s16, s8, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); @@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8, #else CVTS_FUNC(s16, u16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); uint16x4_t vRes1 = vqmovun_s32(vline1_s32); uint16x4_t vRes2 = vqmovun_s32(vline2_s32); vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); @@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16, #else CVTS_FUNC1(s16, 16, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); @@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8, #else CVTS_FUNC(s16, s32, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t 
i = 0; i < w; i += 8) { @@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); } @@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8, #else CVTS_FUNC(s32, u8, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); uint16x4_t vRes1 = vqmovun_s32(vline1_s32); uint16x4_t vRes2 = vqmovun_s32(vline2_s32); uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); @@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8, #else CVTS_FUNC(s32, s8, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); @@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8, #else CVTS_FUNC(s32, u16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); uint16x4_t vRes1 = vqmovun_s32(vline1_s32); uint16x4_t vRes2 = vqmovun_s32(vline2_s32); vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); @@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8, #else CVTS_FUNC(s32, s16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); @@ -2097,7 
+2098,7 @@ CVTS_FUNC1(s32, 8, #else CVTS_FUNC1(s32, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - vline1_s32 = vcvtq_s32_f32(vline1_f32); - vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + vline2_s32 = internal::vroundq_s32_f32(vline2_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); } @@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8, #else CVTS_FUNC(f32, s8, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); @@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8, #else CVTS_FUNC(f32, u16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32); - uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32); + uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32); + uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32); uint16x4_t vRes1 = vqmovn_u32(vline1_u32); uint16x4_t vRes2 = vqmovn_u32(vline2_u32); vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); @@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8, #else CVTS_FUNC(f32, s16, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); int16x4_t vRes1 = vqmovn_s32(vline1_s32); int16x4_t vRes2 = vqmovn_s32(vline2_s32); vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); @@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8, #else CVTS_FUNC(f32, s32, 8, float32x4_t vscale = vdupq_n_f32((f32)alpha); - float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, + float32x4_t vshift = vdupq_n_f32((f32)beta);, { for (size_t i = 0; i < w; i += 8) { @@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8, vline2_f32 = vmulq_f32(vline2_f32, vscale); vline1_f32 = vaddq_f32(vline1_f32, vshift); vline2_f32 = vaddq_f32(vline2_f32, vshift); - int32x4_t 
vline1_s32 = vcvtq_s32_f32(vline1_f32); - int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32); vst1q_s32(_dst + i + 0, vline1_s32); vst1q_s32(_dst + i + 4, vline2_s32); } diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp index 6a65ad3ce0..75502c736b 100644 --- a/3rdparty/carotene/src/div.cpp +++ b/3rdparty/carotene/src/div.cpp @@ -39,6 +39,7 @@ #include "common.hpp" #include "vtransform.hpp" +#include "vround_helper.hpp" #include #include @@ -51,13 +52,6 @@ namespace { #ifdef CAROTENE_NEON -inline float32x4_t vroundq(const float32x4_t& v) -{ - const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); - float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v)))); - return vaddq_f32(v, v_addition); -} - template inline T divSaturateQ(const T &v1, const T &v2, const float scale) { @@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale) } template <> inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) -{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); } +{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } template <> inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) -{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); } - -inline float32x2_t vround(const float32x2_t& v) -{ - const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f)); - float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v)))); - return vadd_f32(v, v_addition); -} +{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } template inline T divSaturate(const T &v1, const T &v2, const float scale) @@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale) } template <> inline int32x2_t divSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale) -{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); } +{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); } template <> inline uint32x2_t divSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) -{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); } +{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); } template diff --git a/3rdparty/carotene/src/phase.cpp b/3rdparty/carotene/src/phase.cpp index 141b1e864a..48dea2a860 100644 --- a/3rdparty/carotene/src/phase.cpp +++ b/3rdparty/carotene/src/phase.cpp @@ -41,6 +41,7 @@ #include #include "common.hpp" +#include "vround_helper.hpp" namespace CAROTENE_NS { @@ -121,8 +122,6 @@ void phase(const Size2D &size, size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; - float32x4_t v_05 = vdupq_n_f32(0.5f); - for (size_t i = 0; i < size.height; ++i) { const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i); @@ -149,8 +148,8 @@ void phase(const Size2D &size, float32x4_t v_dst32f1; FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) - uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), - vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)), + vmovn_u32(internal::vroundq_u32_f32(v_dst32f1))); // 1 v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); @@ -161,8 +160,8 @@ void phase(const Size2D &size, v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) - uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), - vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)), + vmovn_u32(internal::vroundq_u32_f32(v_dst32f1))); vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0), vmovn_u16(v_dst16s1))); @@ -182,8 +181,8 @@ void phase(const Size2D &size, float32x4_t v_dst32f1; FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) - uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), - vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)), + vmovn_u32(internal::vroundq_u32_f32(v_dst32f1))); vst1_u8(dst + j, vmovn_u16(v_dst)); } diff --git a/3rdparty/carotene/src/vround_helper.hpp b/3rdparty/carotene/src/vround_helper.hpp new file mode 100644 index 0000000000..89a6254510 --- /dev/null +++ b/3rdparty/carotene/src/vround_helper.hpp @@ -0,0 +1,102 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_SRC_VROUND_HELPER_HPP +#define CAROTENE_SRC_VROUND_HELPER_HPP + +#include "common.hpp" +#include "vtransform.hpp" + +#ifdef CAROTENE_NEON + +/** + * This helper header is for rounding from float32xN to uint32xN or int32xN, to nearest with ties to even. + * See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even + */ + +// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007 +#define CAROTENE_ROUND_DELTA (12582912.0f) + +namespace CAROTENE_NS { namespace internal { + +inline uint32x4_t vroundq_u32_f32(const float32x4_t val) +{ +#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */ + return vcvtnq_u32_f32(val); +#else + const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA); + return vcvtq_u32_f32(vsubq_f32(vaddq_f32(val, delta), delta)); +#endif +} + +inline uint32x2_t vround_u32_f32(const float32x2_t val) +{ +#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */ + return vcvtn_u32_f32(val); +#else + const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA); + return vcvt_u32_f32(vsub_f32(vadd_f32(val, delta), delta)); +#endif +} + +inline int32x4_t vroundq_s32_f32(const float32x4_t val) +{ +#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */ + return vcvtnq_s32_f32(val); +#else + const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA); + return vcvtq_s32_f32(vsubq_f32(vaddq_f32(val, delta), delta)); +#endif +} + +inline int32x2_t vround_s32_f32(const float32x2_t val) +{ +#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */ + return vcvtn_s32_f32(val); +#else + const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA); + return vcvt_s32_f32(vsub_f32(vadd_f32(val, delta), delta)); +#endif +} + +} } + +#endif // CAROTENE_NEON + +#endif diff --git a/CMakeLists.txt b/CMakeLists.txt index ad34423ea9..72b5aa62a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -915,7 +915,15 @@ foreach(hal ${OpenCV_HAL}) if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;") add_subdirectory(3rdparty/carotene/hal) ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS) - list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})") + + if( NOT DEFINED CAROTENE_NEON_ARCH) + set(CAROTENE_NEON_MSG "Auto detected") + elseif( CAROTENE_NEON_ARCH GREATER 7) + set(CAROTENE_NEON_MSG "Force ARMv8+") + else() + set(CAROTENE_NEON_MSG "Force ARMv7") + endif() + list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})") else() message(STATUS "Carotene: NEON is not available, disabling carotene...") endif() diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown index 4fd256dd93..dba280485f 100644 --- a/doc/tutorials/introduction/config_reference/config_reference.markdown +++ b/doc/tutorials/introduction/config_reference/config_reference.markdown @@ -585,6 +585,7 @@ Following options can be used to change installation layout for common scenarios | `BUILD_FAT_JAVA_LIB` | _ON_
(for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. | | `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. | | `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. | +| `CAROTENE_NEON_ARCH` | _(auto)_ | Select the NEON architecture for Carotene. If it is not set, the architecture is detected automatically. If it is set to 8, ARMv8 (and later) instructions are used. Otherwise, ARMv7 instructions are used. | TODO: need separate tutorials covering bindings builds diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index ee9934135a..d0f645c24c 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1990,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a) #else inline v_int32x4 v_round(const v_float32x4& a) { - static const int32x4_t v_sign = vdupq_n_s32(1 << 31), - v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); - - int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); - return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); + // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007 + float32x4_t delta = vdupq_n_f32(12582912.0f); + return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta))); } #endif inline v_int32x4 v_floor(const v_float32x4& a) diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp index 5158e65bda..d985a1c2b6 100644 --- a/modules/core/test/test_operations.cpp +++ b/modules/core/test/test_operations.cpp @@ -1570,4 +1570,54 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is EXPECT_EQ(1, c.rows); } +// https://github.com/opencv/opencv/issues/24163 +typedef tuple<perf::MatDepth, int, int, int> Arith_Regression24163Param; +typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163; + +#if defined __riscv +TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even) +#else +TEST_P(Core_Arith_Regression24163, test_for_ties_to_even) +#endif +{ + const int matDepth = get<0>(GetParam()); + const int matHeight= get<1>(GetParam()); + const int matWidth = 3; // Fixed + const int alpha = get<2>(GetParam()); + const int beta = get<3>(GetParam()); + + // If alpha and/or beta are negative, and matDepth is unsigned, the test is skipped. + if( ( (alpha < 0) || (beta < 0) ) + && + ( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) ) + { + throw SkipTestException( cv::format("Test is skipped (matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) ); + } + + const int matType = CV_MAKE_TYPE(matDepth, 1); + const Size matSize(matWidth, matHeight); + const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha)); + const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta)); + const Mat result = ( src1 + src2 ) / 2; + + // The default rounding mode is expected to be FE_TONEAREST (ties to even). + const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 ); + const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean)); + + // Compare result and expected. 
+ ASSERT_EQ(expected.size(), result.size()); + EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) << + "result=" << std::endl << result << std::endl << + "expected=" << std::endl << expected; +} + +INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163, + testing::Combine( + testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatDepth + testing::Values( 3, 4, 5, 6), // MatHeight + testing::Values(-2,-1, 0, 1, 2), // alpha (src1 value) + testing::Values( -1, 0, 1 ) // beta (src2 value) + ) +); + }} // namespace
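
Note on the rounding technique (reviewer sketch, not part of the patch): the removed "+ 0.5f then truncate" idioms rounded ties away from zero, whereas lrint() under the default FE_TONEAREST mode, and the expectation in issue #24163, round ties to even. The replacement constant 12582912.0f is 1.5 * 2^23: for any |x| < 2^22, x + 12582912.0f lands in [2^23, 2^24), where consecutive float32 values are exactly 1.0 apart, so the addition itself rounds x to the nearest integer under the FPU's default round-to-nearest-even mode, and subtracting the constant recovers that integer as a float. On ARMv8 the helper instead uses vcvtnq_s32_f32/vcvtnq_u32_f32, which perform the same ties-to-even conversion in a single instruction. A minimal scalar C++ model of the ARMv7 fallback (a hypothetical standalone program; the function name is illustrative):

    // Scalar model of the add/subtract-delta trick in vround_helper.hpp.
    // Assumes the default FE_TONEAREST rounding mode and strict IEEE float
    // semantics (no -ffast-math, which could fold the add/subtract away).
    #include <cassert>

    static float round_half_to_even(float val)
    {
        const float delta = 12582912.0f; // 1.5f * 2^23, i.e. CAROTENE_ROUND_DELTA
        return (val + delta) - delta;    // valid while |val| < 2^22
    }

    int main()
    {
        assert(round_half_to_even(0.5f)  == 0.0f); // tie -> nearest even (0)
        assert(round_half_to_even(1.5f)  == 2.0f); // tie -> nearest even (2)
        assert(round_half_to_even(2.5f)  == 2.0f); // tie -> nearest even (2)
        assert(round_half_to_even(1.4f)  == 1.0f); // ordinary round to nearest
        assert(round_half_to_even(-0.5f) == 0.0f); // negatives round the same way
        return 0;
    }

The |val| < 2^22 precondition is comfortably satisfied by the 8- and 16-bit pixel data these code paths process. To force a particular path when configuring OpenCV, the new option can be passed on the command line, e.g. cmake -DCAROTENE_NEON_ARCH=7 (or 8); leaving it unset keeps the auto-detection added to common.hpp.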