mirror of
https://github.com/opencv/opencv.git
synced 2024-11-24 03:00:14 +08:00
Merge pull request #24271 from Kumataro:fix24163
Fix to convert float32 to int32/uint32 with rounding to nearest (ties to even). #24271 Fix https://github.com/opencv/opencv/issues/24163 ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake (carotene is BSD)
This commit is contained in:
parent
d9d402916a
commit
dba7186378
8
3rdparty/carotene/CMakeLists.txt
vendored
8
3rdparty/carotene/CMakeLists.txt
vendored
@ -42,6 +42,14 @@ endif()
|
|||||||
|
|
||||||
if(WITH_NEON)
|
if(WITH_NEON)
|
||||||
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
|
target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
|
||||||
|
if(NOT DEFINED CAROTENE_NEON_ARCH )
|
||||||
|
elseif(CAROTENE_NEON_ARCH EQUAL 8)
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
|
||||||
|
elseif(CAROTENE_NEON_ARCH EQUAL 7)
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
|
||||||
|
else()
|
||||||
|
target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# we add dummy file to fix XCode build
|
# we add dummy file to fix XCode build
|
||||||
|
13
3rdparty/carotene/src/add_weighted.cpp
vendored
13
3rdparty/carotene/src/add_weighted.cpp
vendored
@ -39,6 +39,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "vtransform.hpp"
|
#include "vtransform.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -106,7 +107,7 @@ template <> struct wAdd<s32>
|
|||||||
{
|
{
|
||||||
valpha = vdupq_n_f32(_alpha);
|
valpha = vdupq_n_f32(_alpha);
|
||||||
vbeta = vdupq_n_f32(_beta);
|
vbeta = vdupq_n_f32(_beta);
|
||||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
vgamma = vdupq_n_f32(_gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<s32>::vec128 & v_src0,
|
void operator() (const VecTraits<s32>::vec128 & v_src0,
|
||||||
@ -118,7 +119,7 @@ template <> struct wAdd<s32>
|
|||||||
|
|
||||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||||
v_dst = vcvtq_s32_f32(vs1);
|
v_dst = vroundq_s32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<s32>::vec64 & v_src0,
|
void operator() (const VecTraits<s32>::vec64 & v_src0,
|
||||||
@ -130,7 +131,7 @@ template <> struct wAdd<s32>
|
|||||||
|
|
||||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||||
v_dst = vcvt_s32_f32(vs1);
|
v_dst = vround_s32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
|
void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
|
||||||
@ -150,7 +151,7 @@ template <> struct wAdd<u32>
|
|||||||
{
|
{
|
||||||
valpha = vdupq_n_f32(_alpha);
|
valpha = vdupq_n_f32(_alpha);
|
||||||
vbeta = vdupq_n_f32(_beta);
|
vbeta = vdupq_n_f32(_beta);
|
||||||
vgamma = vdupq_n_f32(_gamma + 0.5);
|
vgamma = vdupq_n_f32(_gamma);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<u32>::vec128 & v_src0,
|
void operator() (const VecTraits<u32>::vec128 & v_src0,
|
||||||
@ -162,7 +163,7 @@ template <> struct wAdd<u32>
|
|||||||
|
|
||||||
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
vs1 = vmlaq_f32(vgamma, vs1, valpha);
|
||||||
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
vs1 = vmlaq_f32(vs1, vs2, vbeta);
|
||||||
v_dst = vcvtq_u32_f32(vs1);
|
v_dst = vroundq_u32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const VecTraits<u32>::vec64 & v_src0,
|
void operator() (const VecTraits<u32>::vec64 & v_src0,
|
||||||
@ -174,7 +175,7 @@ template <> struct wAdd<u32>
|
|||||||
|
|
||||||
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
|
||||||
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
|
||||||
v_dst = vcvt_u32_f32(vs1);
|
v_dst = vround_u32_f32(vs1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
|
void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
|
||||||
|
15
3rdparty/carotene/src/blur.cpp
vendored
15
3rdparty/carotene/src/blur.cpp
vendored
@ -41,6 +41,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "saturate_cast.hpp"
|
#include "saturate_cast.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
//#define FLOAT_VARIANT_1_9
|
//#define FLOAT_VARIANT_1_9
|
||||||
#ifdef FLOAT_VARIANT_1_9
|
#ifdef FLOAT_VARIANT_1_9
|
||||||
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
|
float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
|
||||||
float32x4_t v0_5 = vdupq_n_f32 (.5);
|
|
||||||
#else
|
#else
|
||||||
const int16x8_t vScale = vmovq_n_s16(3640);
|
const int16x8_t vScale = vmovq_n_s16(3640);
|
||||||
#endif
|
#endif
|
||||||
@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
|
|||||||
#define FLOAT_VARIANT_1_25
|
#define FLOAT_VARIANT_1_25
|
||||||
#ifdef FLOAT_VARIANT_1_25
|
#ifdef FLOAT_VARIANT_1_25
|
||||||
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
|
float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
|
||||||
float32x4_t v0_5 = vdupq_n_f32 (.5f);
|
|
||||||
#else
|
#else
|
||||||
const int16x8_t vScale = vmovq_n_s16(1310);
|
const int16x8_t vScale = vmovq_n_s16(1310);
|
||||||
#endif
|
#endif
|
||||||
@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
|
|||||||
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
|
||||||
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
|
float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
|
||||||
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
|
float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
|
||||||
tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
|
tres1 = internal::vroundq_u32_f32(vf1);
|
||||||
tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
|
tres2 = internal::vroundq_u32_f32(vf2);
|
||||||
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
|
||||||
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
vst1_u8(drow + x - 8, vmovn_u16(t0));
|
||||||
#else
|
#else
|
||||||
|
16
3rdparty/carotene/src/colorconvert.cpp
vendored
16
3rdparty/carotene/src/colorconvert.cpp
vendored
@ -40,6 +40,7 @@
|
|||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
|
||||||
#include "saturate_cast.hpp"
|
#include "saturate_cast.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
|
|||||||
vSt3 = vmulq_f32(vHF1, vDivTab);
|
vSt3 = vmulq_f32(vHF1, vDivTab);
|
||||||
vSt4 = vmulq_f32(vHF2, vDivTab);
|
vSt4 = vmulq_f32(vHF2, vDivTab);
|
||||||
|
|
||||||
float32x4_t bias = vdupq_n_f32(0.5f);
|
uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
|
||||||
|
uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
|
||||||
vSt1 = vaddq_f32(vSt1, bias);
|
uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
|
||||||
vSt2 = vaddq_f32(vSt2, bias);
|
uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
|
||||||
vSt3 = vaddq_f32(vSt3, bias);
|
|
||||||
vSt4 = vaddq_f32(vSt4, bias);
|
|
||||||
|
|
||||||
uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
|
|
||||||
uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
|
|
||||||
uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
|
|
||||||
uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
|
|
||||||
|
|
||||||
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
|
int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
|
||||||
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
|
int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
|
||||||
|
11
3rdparty/carotene/src/common.hpp
vendored
11
3rdparty/carotene/src/common.hpp
vendored
@ -58,6 +58,17 @@
|
|||||||
|
|
||||||
namespace CAROTENE_NS { namespace internal {
|
namespace CAROTENE_NS { namespace internal {
|
||||||
|
|
||||||
|
#ifndef CAROTENE_NEON_ARCH
|
||||||
|
# if defined(__aarch64__) || defined(__aarch32__)
|
||||||
|
# define CAROTENE_NEON_ARCH 8
|
||||||
|
# else
|
||||||
|
# define CAROTENE_NEON_ARCH 7
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
|
||||||
|
# error("ARMv7 doen't support A32/A64 Neon instructions")
|
||||||
|
#endif
|
||||||
|
|
||||||
inline void prefetch(const void *ptr, size_t offset = 32*10)
|
inline void prefetch(const void *ptr, size_t offset = 32*10)
|
||||||
{
|
{
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
|
215
3rdparty/carotene/src/convert_scale.cpp
vendored
215
3rdparty/carotene/src/convert_scale.cpp
vendored
@ -38,6 +38,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(u8, 16,
|
CVTS_FUNC1(u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
||||||
@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s8, 16,
|
CVTS_FUNC(u8, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
|
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
|
||||||
@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, u16, 16,
|
CVTS_FUNC(u8, u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
|
vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
|
||||||
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
|
vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
|
||||||
}
|
}
|
||||||
@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s16, 16,
|
CVTS_FUNC(u8, s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -463,10 +464,10 @@ CVTS_FUNC(u8, s16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
|
vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
|
||||||
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
|
vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
|
||||||
}
|
}
|
||||||
@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u8, s32, 16,
|
CVTS_FUNC(u8, s32, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
vst1q_s32(_dst + i + 8, vline3_s32);
|
vst1q_s32(_dst + i + 8, vline3_s32);
|
||||||
@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, u8, 16,
|
CVTS_FUNC(s8, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
|
||||||
@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s8, 16,
|
CVTS_FUNC1(s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
|
vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
|
||||||
@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, u16, 16,
|
CVTS_FUNC(s8, u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
|
||||||
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
|
||||||
vst1q_u16(_dst + i + 0, vRes1_u16);
|
vst1q_u16(_dst + i + 0, vRes1_u16);
|
||||||
@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, s16, 16,
|
CVTS_FUNC(s8, s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
|
||||||
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
|
||||||
vst1q_s16(_dst + i + 0, vRes1_s16);
|
vst1q_s16(_dst + i + 0, vRes1_s16);
|
||||||
@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s8, s32, 16,
|
CVTS_FUNC(s8, s32, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 16)
|
for (size_t i = 0; i < w; i += 16)
|
||||||
{
|
{
|
||||||
@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16,
|
|||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
vline3_f32 = vaddq_f32(vline3_f32, vshift);
|
||||||
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
vline4_f32 = vaddq_f32(vline4_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vline3_s32 = vcvtq_s32_f32(vline3_f32);
|
vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
|
||||||
vline4_s32 = vcvtq_s32_f32(vline4_f32);
|
vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
vst1q_s32(_dst + i + 8, vline3_s32);
|
vst1q_s32(_dst + i + 8, vline3_s32);
|
||||||
@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, u8, 16,
|
CVTS_FUNC(u16, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s8, 16,
|
CVTS_FUNC(u16, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(u16, 16,
|
CVTS_FUNC1(u16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s16, 8,
|
CVTS_FUNC(u16, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(u16, s32, 8,
|
CVTS_FUNC(u16, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, u8, 16,
|
CVTS_FUNC(s16, u8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, s8, 16,
|
CVTS_FUNC(s16, s8, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, u16, 8,
|
CVTS_FUNC(s16, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s16, 16,
|
CVTS_FUNC1(s16, 16,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s16, s32, 8,
|
CVTS_FUNC(s16, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, u8, 8,
|
CVTS_FUNC(s32, u8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
|
uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
|
||||||
@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, s8, 8,
|
CVTS_FUNC(s32, s8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, u16, 8,
|
CVTS_FUNC(s32, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
|
||||||
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(s32, s16, 8,
|
CVTS_FUNC(s32, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -2097,7 +2098,7 @@ CVTS_FUNC1(s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC1(s32, 8,
|
CVTS_FUNC1(s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s8, 8,
|
CVTS_FUNC(f32, s8, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
|
||||||
@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, u16, 8,
|
CVTS_FUNC(f32, u16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
|
uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32);
|
||||||
uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
|
uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32);
|
||||||
uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
|
uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
|
||||||
uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
|
uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
|
||||||
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
|
||||||
@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s16, 8,
|
CVTS_FUNC(f32, s16, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
int16x4_t vRes1 = vqmovn_s32(vline1_s32);
|
||||||
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
int16x4_t vRes2 = vqmovn_s32(vline2_s32);
|
||||||
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
|
||||||
@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8,
|
|||||||
#else
|
#else
|
||||||
CVTS_FUNC(f32, s32, 8,
|
CVTS_FUNC(f32, s32, 8,
|
||||||
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
float32x4_t vscale = vdupq_n_f32((f32)alpha);
|
||||||
float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
|
float32x4_t vshift = vdupq_n_f32((f32)beta);,
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < w; i += 8)
|
for (size_t i = 0; i < w; i += 8)
|
||||||
{
|
{
|
||||||
@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8,
|
|||||||
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
vline2_f32 = vmulq_f32(vline2_f32, vscale);
|
||||||
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
vline1_f32 = vaddq_f32(vline1_f32, vshift);
|
||||||
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
vline2_f32 = vaddq_f32(vline2_f32, vshift);
|
||||||
int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
|
int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
|
||||||
int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
|
int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
|
||||||
vst1q_s32(_dst + i + 0, vline1_s32);
|
vst1q_s32(_dst + i + 0, vline1_s32);
|
||||||
vst1q_s32(_dst + i + 4, vline2_s32);
|
vst1q_s32(_dst + i + 4, vline2_s32);
|
||||||
}
|
}
|
||||||
|
23
3rdparty/carotene/src/div.cpp
vendored
23
3rdparty/carotene/src/div.cpp
vendored
@ -39,6 +39,7 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "vtransform.hpp"
|
#include "vtransform.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
@ -51,13 +52,6 @@ namespace {
|
|||||||
|
|
||||||
#ifdef CAROTENE_NEON
|
#ifdef CAROTENE_NEON
|
||||||
|
|
||||||
inline float32x4_t vroundq(const float32x4_t& v)
|
|
||||||
{
|
|
||||||
const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
|
|
||||||
float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
|
|
||||||
return vaddq_f32(v, v_addition);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
||||||
{
|
{
|
||||||
@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
|
|||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
|
inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
|
||||||
{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
|
{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
|
||||||
template <>
|
template <>
|
||||||
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
|
inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
|
||||||
{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
|
{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
|
||||||
|
|
||||||
inline float32x2_t vround(const float32x2_t& v)
|
|
||||||
{
|
|
||||||
const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
|
|
||||||
float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
|
|
||||||
return vadd_f32(v, v_addition);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
inline T divSaturate(const T &v1, const T &v2, const float scale)
|
inline T divSaturate(const T &v1, const T &v2, const float scale)
|
||||||
@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale)
|
|||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
|
inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
|
||||||
{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
|
{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
|
||||||
template <>
|
template <>
|
||||||
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
|
inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
|
||||||
{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
|
{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
|
||||||
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
15
3rdparty/carotene/src/phase.cpp
vendored
15
3rdparty/carotene/src/phase.cpp
vendored
@ -41,6 +41,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
#include "vround_helper.hpp"
|
||||||
|
|
||||||
namespace CAROTENE_NS {
|
namespace CAROTENE_NS {
|
||||||
|
|
||||||
@ -121,8 +122,6 @@ void phase(const Size2D &size,
|
|||||||
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
|
||||||
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
|
||||||
|
|
||||||
float32x4_t v_05 = vdupq_n_f32(0.5f);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < size.height; ++i)
|
for (size_t i = 0; i < size.height; ++i)
|
||||||
{
|
{
|
||||||
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
|
||||||
@ -149,8 +148,8 @@ void phase(const Size2D &size,
|
|||||||
float32x4_t v_dst32f1;
|
float32x4_t v_dst32f1;
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
// 1
|
// 1
|
||||||
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
|
v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
|
||||||
@ -161,8 +160,8 @@ void phase(const Size2D &size,
|
|||||||
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
|
v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
|
vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
|
||||||
vmovn_u16(v_dst16s1)));
|
vmovn_u16(v_dst16s1)));
|
||||||
@ -182,8 +181,8 @@ void phase(const Size2D &size,
|
|||||||
float32x4_t v_dst32f1;
|
float32x4_t v_dst32f1;
|
||||||
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
|
||||||
|
|
||||||
uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
|
uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
|
||||||
vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
|
vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
|
||||||
|
|
||||||
vst1_u8(dst + j, vmovn_u16(v_dst));
|
vst1_u8(dst + j, vmovn_u16(v_dst));
|
||||||
}
|
}
|
||||||
|
102
3rdparty/carotene/src/vround_helper.hpp
vendored
Normal file
102
3rdparty/carotene/src/vround_helper.hpp
vendored
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
/*
|
||||||
|
* By downloading, copying, installing or using the software you agree to this license.
|
||||||
|
* If you do not agree to this license, do not download, install,
|
||||||
|
* copy or use the software.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* License Agreement
|
||||||
|
* For Open Source Computer Vision Library
|
||||||
|
* (3-clause BSD License)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
|
||||||
|
* Third party copyrights are property of their respective owners.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
* are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* * Neither the names of the copyright holders nor the names of the contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* This software is provided by the copyright holders and contributors "as is" and
|
||||||
|
* any express or implied warranties, including, but not limited to, the implied
|
||||||
|
* warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||||
|
* In no event shall copyright holders or contributors be liable for any direct,
|
||||||
|
* indirect, incidental, special, exemplary, or consequential damages
|
||||||
|
* (including, but not limited to, procurement of substitute goods or services;
|
||||||
|
* loss of use, data, or profits; or business interruption) however caused
|
||||||
|
* and on any theory of liability, whether in contract, strict liability,
|
||||||
|
* or tort (including negligence or otherwise) arising in any way out of
|
||||||
|
* the use of this software, even if advised of the possibility of such damage.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
|
||||||
|
#define CAROTENE_SRC_VROUND_HELPER_HPP
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "vtransform.hpp"
|
||||||
|
|
||||||
|
#ifdef CAROTENE_NEON
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This helper header is for rounding from float32xN to uin32xN or int32xN to nearest, ties to even.
|
||||||
|
* See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
|
||||||
|
*/
|
||||||
|
|
||||||
|
// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
|
||||||
|
#define CAROTENE_ROUND_DELTA (12582912.0f)
|
||||||
|
|
||||||
|
namespace CAROTENE_NS { namespace internal {
|
||||||
|
|
||||||
|
// Round each lane of a float32x4 to the nearest unsigned integer, ties to even.
inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_u32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x4_t bias = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(val, bias), bias);
    return vcvtq_u32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x2 to the nearest unsigned integer, ties to even.
inline uint32x2_t vround_u32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_u32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x2_t bias = vdup_n_f32(CAROTENE_ROUND_DELTA);
    const float32x2_t rounded = vsub_f32(vadd_f32(val, bias), bias);
    return vcvt_u32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x4 to the nearest signed integer, ties to even.
inline int32x4_t vroundq_s32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_s32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x4_t bias = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(val, bias), bias);
    return vcvtq_s32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
// Round each lane of a float32x2 to the nearest signed integer, ties to even.
inline int32x2_t vround_s32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_s32_f32(val);
#else
    // ARMv7 fallback: biasing by 1.5 * 2^23 makes the FPU round the mantissa
    // to an integer in the default FE_TONEAREST (ties-to-even) mode; the
    // subsequent truncating conversion of the now-integral float is exact.
    const float32x2_t bias = vdup_n_f32(CAROTENE_ROUND_DELTA);
    const float32x2_t rounded = vsub_f32(vadd_f32(val, bias), bias);
    return vcvt_s32_f32(rounded);
#endif
}
|
||||||
|
|
||||||
|
} }
|
||||||
|
|
||||||
|
#endif // CAROTENE_NEON
|
||||||
|
|
||||||
|
#endif
|
@ -915,7 +915,15 @@ foreach(hal ${OpenCV_HAL})
|
|||||||
if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
  add_subdirectory(3rdparty/carotene/hal)
  ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
  # Report how the carotene NEON architecture was chosen alongside the HAL version.
  if(NOT DEFINED CAROTENE_NEON_ARCH)
    set(CAROTENE_NEON_MSG "Auto detected")
  elseif(CAROTENE_NEON_ARCH GREATER 7)
    set(CAROTENE_NEON_MSG "Force ARMv8+")
  else()
    set(CAROTENE_NEON_MSG "Force ARMv7")
  endif()
  list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})")
else()
  message(STATUS "Carotene: NEON is not available, disabling carotene...")
endif()
|
||||||
|
@ -585,6 +585,7 @@ Following options can be used to change installation layout for common scenarios
|
|||||||
| `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
|
| `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
|
||||||
| `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
|
| `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
|
||||||
| `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
|
| `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
|
||||||
|
| `CAROTENE_NEON_ARCH` | '(auto)' | Select the NEON architecture for Carotene. If unset, it is auto-detected. If set to 8, ARMv8 (and later) is used; otherwise ARMv7 is used. |
|
||||||
|
|
||||||
TODO: need separate tutorials covering bindings builds
|
TODO: need separate tutorials covering bindings builds
|
||||||
|
|
||||||
|
@ -1990,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
|
|||||||
#else
|
#else
|
||||||
// Round to nearest, ties to even, on targets without ARMv8 vcvtnq_s32_f32.
inline v_int32x4 v_round(const v_float32x4& a)
{
    // Adding and subtracting 1.5 * 2^23 forces the FPU to round the value to
    // an integer in the default FE_TONEAREST (ties-to-even) mode.
    // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
    const float32x4_t delta = vdupq_n_f32(12582912.0f);
    const float32x4_t rounded = vsubq_f32(vaddq_f32(a.val, delta), delta);
    return v_int32x4(vcvtq_s32_f32(rounded));
}
|
||||||
#endif
|
#endif
|
||||||
inline v_int32x4 v_floor(const v_float32x4& a)
|
inline v_int32x4 v_floor(const v_float32x4& a)
|
||||||
|
@ -1570,4 +1570,54 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is
|
|||||||
EXPECT_EQ(1, c.rows);
|
EXPECT_EQ(1, c.rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://github.com/opencv/opencv/issues/24163
|
||||||
|
typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
|
||||||
|
typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
|
||||||
|
|
||||||
|
#if defined __riscv
|
||||||
|
TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
|
||||||
|
#else
|
||||||
|
TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
const int matDepth = get<0>(GetParam());
|
||||||
|
const int matHeight= get<1>(GetParam());
|
||||||
|
const int matWidth = 3; // Fixed
|
||||||
|
const int alpha = get<2>(GetParam());
|
||||||
|
const int beta = get<3>(GetParam());
|
||||||
|
|
||||||
|
// If alpha and/or beta are negative, and matDepth is unsigned, test is passed.
|
||||||
|
if( ( (alpha < 0) || (beta < 0) )
|
||||||
|
&&
|
||||||
|
( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) )
|
||||||
|
{
|
||||||
|
throw SkipTestException( cv::format("Test is skipped(matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) );
|
||||||
|
}
|
||||||
|
|
||||||
|
const int matType = CV_MAKE_TYPE(matDepth, 1);
|
||||||
|
const Size matSize(matWidth, matHeight);
|
||||||
|
const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha));
|
||||||
|
const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta));
|
||||||
|
const Mat result = ( src1 + src2 ) / 2;
|
||||||
|
|
||||||
|
// Expected that default is FE_TONEAREST(Ties to Even).
|
||||||
|
const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 );
|
||||||
|
const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean));
|
||||||
|
|
||||||
|
// Compare result and extected.
|
||||||
|
ASSERT_EQ(expected.size(), result.size());
|
||||||
|
EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) <<
|
||||||
|
"result=" << std::endl << result << std::endl <<
|
||||||
|
"expected=" << std::endl << expected;
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163,
|
||||||
|
testing::Combine(
|
||||||
|
testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatType
|
||||||
|
testing::Values( 3, 4, 5, 6), // MatHeight
|
||||||
|
testing::Values(-2,-1, 0, 1, 2), // src1
|
||||||
|
testing::Values( -1, 0, 1 ) // src2
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
}} // namespace
|
}} // namespace
|
||||||
|
Loading…
Reference in New Issue
Block a user