Merge pull request #24271 from Kumataro:fix24163

Fix conversion from float32 to int32/uint32 so that it rounds to nearest (ties to even). #24271

Fix https://github.com/opencv/opencv/issues/24163
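
For background: on the ARMv7 NEON paths, the float-to-integer step was previously done by adding `0.5` and converting with `vcvt`, which truncates toward zero, so halfway cases were rounded away from zero. The scalar reference path uses the default FPU mode, round to nearest with ties to even, hence the mismatches in the issue. A minimal scalar sketch of the divergence (illustration only, not code from this patch):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const float vals[] = { -1.5f, -0.5f, 0.5f, 1.5f, 2.5f };
    for (float v : vals)
    {
        int add_half = (int)(v + 0.5f);   // old scheme: add 0.5, truncate toward zero
        int nearest  = (int)std::rint(v); // round to nearest, ties to even (default FP env)
        std::printf("%5.1f -> add-half: %2d  rint: %2d\n", v, add_half, nearest);
    }
    return 0;
}
```

Halfway cases such as `0.5` and `2.5` give `1` vs `0` and `3` vs `2`, and `-1.5` gives `-1` vs `-2`: exactly the off-by-one differences reported in the issue.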

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

(carotene is BSD)
Kumataro 2023-12-25 18:17:17 +09:00 committed by GitHub
parent d9d402916a
commit dba7186378
13 changed files with 323 additions and 164 deletions

View File

@@ -42,6 +42,14 @@ endif()
 if(WITH_NEON)
   target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+  if(NOT DEFINED CAROTENE_NEON_ARCH )
+  elseif(CAROTENE_NEON_ARCH EQUAL 8)
+    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=8")
+  elseif(CAROTENE_NEON_ARCH EQUAL 7)
+    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=7")
+  else()
+    target_compile_definitions(carotene_objs PRIVATE "-DCAROTENE_NEON_ARCH=0")
+  endif()
 endif()
 
 # we add dummy file to fix XCode build
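
With this in place, the NEON generation that carotene targets can be pinned at configure time with `-DCAROTENE_NEON_ARCH=8` or `-DCAROTENE_NEON_ARCH=7`; when the variable is left unset, no definition is passed and the auto-detection in `common.hpp` (see the hunk below) takes over.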

View File

@@ -39,6 +39,7 @@
 
 #include "common.hpp"
 #include "vtransform.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -106,7 +107,7 @@ template <> struct wAdd<s32>
     {
        valpha = vdupq_n_f32(_alpha);
        vbeta = vdupq_n_f32(_beta);
-       vgamma = vdupq_n_f32(_gamma + 0.5);
+       vgamma = vdupq_n_f32(_gamma);
    }
 
    void operator() (const VecTraits<s32>::vec128 & v_src0,
@@ -118,7 +119,7 @@ template <> struct wAdd<s32>
        vs1 = vmlaq_f32(vgamma, vs1, valpha);
        vs1 = vmlaq_f32(vs1, vs2, vbeta);
-       v_dst = vcvtq_s32_f32(vs1);
+       v_dst = vroundq_s32_f32(vs1);
    }
 
    void operator() (const VecTraits<s32>::vec64 & v_src0,
@@ -130,7 +131,7 @@ template <> struct wAdd<s32>
        vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
        vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-       v_dst = vcvt_s32_f32(vs1);
+       v_dst = vround_s32_f32(vs1);
    }
 
    void operator() (const s32 * src0, const s32 * src1, s32 * dst) const
@@ -150,7 +151,7 @@ template <> struct wAdd<u32>
    {
        valpha = vdupq_n_f32(_alpha);
        vbeta = vdupq_n_f32(_beta);
-       vgamma = vdupq_n_f32(_gamma + 0.5);
+       vgamma = vdupq_n_f32(_gamma);
    }
 
    void operator() (const VecTraits<u32>::vec128 & v_src0,
@@ -162,7 +163,7 @@ template <> struct wAdd<u32>
        vs1 = vmlaq_f32(vgamma, vs1, valpha);
        vs1 = vmlaq_f32(vs1, vs2, vbeta);
-       v_dst = vcvtq_u32_f32(vs1);
+       v_dst = vroundq_u32_f32(vs1);
    }
 
    void operator() (const VecTraits<u32>::vec64 & v_src0,
@@ -174,7 +175,7 @@ template <> struct wAdd<u32>
        vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha));
        vs1 = vmla_f32(vs1, vs2, vget_low(vbeta));
-       v_dst = vcvt_u32_f32(vs1);
+       v_dst = vround_u32_f32(vs1);
    }
 
    void operator() (const u32 * src0, const u32 * src1, u32 * dst) const
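
The `wAdd` change above is representative of the whole patch: the `+ 0.5` bias baked into `vgamma` is dropped, and the truncating `vcvt(q)` conversions are replaced by the new `vround(q)` helpers. A scalar model of the before/after semantics (a sketch, not the NEON code itself):

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of wAdd: dst = round(src0*alpha + src1*beta + gamma)
int main()
{
    const float alpha = 0.5f, beta = 0.5f, gamma = 0.0f;
    const int src0 = 1, src1 = 0;      // weighted sum = 0.5
    float v = src0 * alpha + src1 * beta + gamma;
    int before = (int)(v + 0.5f);      // old: truncate(0.5 + 0.5) = 1
    int after  = (int)std::rint(v);    // new: rint(0.5) = 0 (ties to even)
    std::printf("before=%d after=%d\n", before, after);
    return 0;
}
```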

View File

@@ -41,6 +41,7 @@
 
 #include "common.hpp"
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -198,7 +199,6 @@ void blur3x3(const Size2D &size, s32 cn,
 //#define FLOAT_VARIANT_1_9
 #ifdef FLOAT_VARIANT_1_9
     float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
-    float32x4_t v0_5 = vdupq_n_f32 (.5);
 #else
     const int16x8_t vScale = vmovq_n_s16(3640);
 #endif
@@ -283,8 +283,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -445,8 +445,8 @@ void blur3x3(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else
@@ -508,7 +508,6 @@ void blur5x5(const Size2D &size, s32 cn,
 #define FLOAT_VARIANT_1_25
 #ifdef FLOAT_VARIANT_1_25
     float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
-    float32x4_t v0_5 = vdupq_n_f32 (.5f);
 #else
     const int16x8_t vScale = vmovq_n_s16(1310);
 #endif
@@ -752,8 +751,8 @@ void blur5x5(const Size2D &size, s32 cn,
                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
                 float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
                 float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
-                tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
-                tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
+                tres1 = internal::vroundq_u32_f32(vf1);
+                tres2 = internal::vroundq_u32_f32(vf2);
                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
                 vst1_u8(drow + x - 8, vmovn_u16(t0));
 #else

View File

@@ -40,6 +40,7 @@
 
 #include "common.hpp"
 #include "saturate_cast.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -1166,17 +1167,10 @@ inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const ui
     vSt3 = vmulq_f32(vHF1, vDivTab);
     vSt4 = vmulq_f32(vHF2, vDivTab);
 
-    float32x4_t bias = vdupq_n_f32(0.5f);
-
-    vSt1 = vaddq_f32(vSt1, bias);
-    vSt2 = vaddq_f32(vSt2, bias);
-    vSt3 = vaddq_f32(vSt3, bias);
-    vSt4 = vaddq_f32(vSt4, bias);
-
-    uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
-    uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
-    uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
-    uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
+    uint32x4_t vRes1 = internal::vroundq_u32_f32(vSt1);
+    uint32x4_t vRes2 = internal::vroundq_u32_f32(vSt2);
+    uint32x4_t vRes3 = internal::vroundq_u32_f32(vSt3);
+    uint32x4_t vRes4 = internal::vroundq_u32_f32(vSt4);
 
     int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
     int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

View File

@@ -58,6 +58,17 @@
 
 namespace CAROTENE_NS { namespace internal {
 
+#ifndef CAROTENE_NEON_ARCH
+# if defined(__aarch64__) || defined(__aarch32__)
+#  define CAROTENE_NEON_ARCH 8
+# else
+#  define CAROTENE_NEON_ARCH 7
+# endif
+#endif
+
+#if ( !defined(__aarch64__) && !defined(__aarch32__) ) && (CAROTENE_NEON_ARCH == 8 )
+# error("ARMv7 doesn't support A32/A64 Neon instructions")
+#endif
+
 inline void prefetch(const void *ptr, size_t offset = 32*10)
 {
 #if defined __GNUC__

View File

@@ -38,6 +38,7 @@
  */
 
 #include "common.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -185,7 +186,7 @@ CVTS_FUNC1(u8, 16,
 #else
 CVTS_FUNC1(u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -209,10 +210,10 @@ CVTS_FUNC1(u8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -270,7 +271,7 @@ CVTS_FUNC(u8, s8, 16,
 #else
 CVTS_FUNC(u8, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -294,10 +295,10 @@ CVTS_FUNC(u8, s8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
        vline3_f32 = vaddq_f32(vline3_f32, vshift);
        vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16)));
@@ -355,7 +356,7 @@ CVTS_FUNC(u8, u16, 16,
 #else
 CVTS_FUNC(u8, u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -379,10 +380,10 @@ CVTS_FUNC(u8, u16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)));
         vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)));
     }
@@ -439,7 +440,7 @@ CVTS_FUNC(u8, s16, 16,
 #else
 CVTS_FUNC(u8, s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -463,10 +464,10 @@ CVTS_FUNC(u8, s16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)));
         vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)));
     }
@@ -526,7 +527,7 @@ CVTS_FUNC(u8, s32, 16,
 #else
 CVTS_FUNC(u8, s32, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -550,10 +551,10 @@ CVTS_FUNC(u8, s32, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        int32x4_t vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        int32x4_t vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
         vst1q_s32(_dst + i + 8, vline3_s32);
@@ -693,7 +694,7 @@ CVTS_FUNC(s8, u8, 16,
 #else
 CVTS_FUNC(s8, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -717,10 +718,10 @@ CVTS_FUNC(s8, u8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16)));
@@ -778,7 +779,7 @@ CVTS_FUNC1(s8, 16,
 #else
 CVTS_FUNC1(s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -802,10 +803,10 @@ CVTS_FUNC1(s8, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16)));
@@ -863,7 +864,7 @@ CVTS_FUNC(s8, u16, 16,
 #else
 CVTS_FUNC(s8, u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -887,10 +888,10 @@ CVTS_FUNC(s8, u16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32));
         uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32));
         vst1q_u16(_dst + i + 0, vRes1_u16);
@@ -949,7 +950,7 @@ CVTS_FUNC(s8, s16, 16,
 #else
 CVTS_FUNC(s8, s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -973,10 +974,10 @@ CVTS_FUNC(s8, s16, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32));
         int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32));
         vst1q_s16(_dst + i + 0, vRes1_s16);
@@ -1038,7 +1039,7 @@ CVTS_FUNC(s8, s32, 16,
 #else
 CVTS_FUNC(s8, s32, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 16)
     {
@@ -1062,10 +1063,10 @@ CVTS_FUNC(s8, s32, 16,
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
         vline3_f32 = vaddq_f32(vline3_f32, vshift);
         vline4_f32 = vaddq_f32(vline4_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
-        vline3_s32 = vcvtq_s32_f32(vline3_f32);
-        vline4_s32 = vcvtq_s32_f32(vline4_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
+        vline3_s32 = internal::vroundq_s32_f32(vline3_f32);
+        vline4_s32 = internal::vroundq_s32_f32(vline4_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
         vst1q_s32(_dst + i + 8, vline3_s32);
@@ -1190,7 +1191,7 @@ CVTS_FUNC(u16, u8, 16,
 #else
 CVTS_FUNC(u16, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1204,8 +1205,8 @@ CVTS_FUNC(u16, u8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1249,7 +1250,7 @@ CVTS_FUNC(u16, s8, 16,
 #else
 CVTS_FUNC(u16, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1263,8 +1264,8 @@ CVTS_FUNC(u16, s8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1307,7 +1308,7 @@ CVTS_FUNC1(u16, 16,
 #else
 CVTS_FUNC1(u16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1321,8 +1322,8 @@ CVTS_FUNC1(u16, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1364,7 +1365,7 @@ CVTS_FUNC(u16, s16, 8,
 #else
 CVTS_FUNC(u16, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
    {
@@ -1378,8 +1379,8 @@ CVTS_FUNC(u16, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1421,7 +1422,7 @@ CVTS_FUNC(u16, s32, 8,
 #else
 CVTS_FUNC(u16, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1435,8 +1436,8 @@ CVTS_FUNC(u16, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -1530,7 +1531,7 @@ CVTS_FUNC(s16, u8, 16,
 #else
 CVTS_FUNC(s16, u8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1544,8 +1545,8 @@ CVTS_FUNC(s16, u8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2));
@@ -1589,7 +1590,7 @@ CVTS_FUNC(s16, s8, 16,
 #else
 CVTS_FUNC(s16, s8, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1603,8 +1604,8 @@ CVTS_FUNC(s16, s8, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1647,7 +1648,7 @@ CVTS_FUNC(s16, u16, 8,
 #else
 CVTS_FUNC(s16, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1661,8 +1662,8 @@ CVTS_FUNC(s16, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -1704,7 +1705,7 @@ CVTS_FUNC1(s16, 16,
 #else
 CVTS_FUNC1(s16, 16,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1718,8 +1719,8 @@ CVTS_FUNC1(s16, 16,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -1761,7 +1762,7 @@ CVTS_FUNC(s16, s32, 8,
 #else
 CVTS_FUNC(s16, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1775,8 +1776,8 @@ CVTS_FUNC(s16, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -1870,7 +1871,7 @@ CVTS_FUNC(s32, u8, 8,
 #else
 CVTS_FUNC(s32, u8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1883,8 +1884,8 @@ CVTS_FUNC(s32, u8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2));
@@ -1928,7 +1929,7 @@ CVTS_FUNC(s32, s8, 8,
 #else
 CVTS_FUNC(s32, s8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1941,8 +1942,8 @@ CVTS_FUNC(s32, s8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -1985,7 +1986,7 @@ CVTS_FUNC(s32, u16, 8,
 #else
 CVTS_FUNC(s32, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -1998,8 +1999,8 @@ CVTS_FUNC(s32, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovun_s32(vline1_s32);
         uint16x4_t vRes2 = vqmovun_s32(vline2_s32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2041,7 +2042,7 @@ CVTS_FUNC(s32, s16, 8,
 #else
 CVTS_FUNC(s32, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2054,8 +2055,8 @@ CVTS_FUNC(s32, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2097,7 +2098,7 @@ CVTS_FUNC1(s32, 8,
 #else
 CVTS_FUNC1(s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2110,8 +2111,8 @@ CVTS_FUNC1(s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }
@@ -2272,7 +2273,7 @@ CVTS_FUNC(f32, s8, 8,
 #else
 CVTS_FUNC(f32, s8, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2283,8 +2284,8 @@ CVTS_FUNC(f32, s8, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2));
@@ -2325,7 +2326,7 @@ CVTS_FUNC(f32, u16, 8,
 #else
 CVTS_FUNC(f32, u16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2336,8 +2337,8 @@ CVTS_FUNC(f32, u16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32);
-        uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32);
+        uint32x4_t vline1_u32 = internal::vroundq_u32_f32(vline1_f32);
+        uint32x4_t vline2_u32 = internal::vroundq_u32_f32(vline2_f32);
         uint16x4_t vRes1 = vqmovn_u32(vline1_u32);
         uint16x4_t vRes2 = vqmovn_u32(vline2_u32);
         vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2));
@@ -2377,7 +2378,7 @@ CVTS_FUNC(f32, s16, 8,
 #else
 CVTS_FUNC(f32, s16, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2388,8 +2389,8 @@ CVTS_FUNC(f32, s16, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         int16x4_t vRes1 = vqmovn_s32(vline1_s32);
         int16x4_t vRes2 = vqmovn_s32(vline2_s32);
         vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2));
@@ -2429,7 +2430,7 @@ CVTS_FUNC(f32, s32, 8,
 #else
 CVTS_FUNC(f32, s32, 8,
     float32x4_t vscale = vdupq_n_f32((f32)alpha);
-    float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);,
+    float32x4_t vshift = vdupq_n_f32((f32)beta);,
 {
     for (size_t i = 0; i < w; i += 8)
     {
@@ -2440,8 +2441,8 @@ CVTS_FUNC(f32, s32, 8,
         vline2_f32 = vmulq_f32(vline2_f32, vscale);
         vline1_f32 = vaddq_f32(vline1_f32, vshift);
         vline2_f32 = vaddq_f32(vline2_f32, vshift);
-        int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32);
-        int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32);
+        int32x4_t vline1_s32 = internal::vroundq_s32_f32(vline1_f32);
+        int32x4_t vline2_s32 = internal::vroundq_s32_f32(vline2_f32);
         vst1q_s32(_dst + i + 0, vline1_s32);
         vst1q_s32(_dst + i + 4, vline2_s32);
     }

View File

@@ -39,6 +39,7 @@
 
 #include "common.hpp"
 #include "vtransform.hpp"
+#include "vround_helper.hpp"
 
 #include <cstring>
 #include <cfloat>
@@ -51,13 +52,6 @@ namespace {
 
 #ifdef CAROTENE_NEON
 
-inline float32x4_t vroundq(const float32x4_t& v)
-{
-    const int32x4_t signMask = vdupq_n_s32(1 << 31), half = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-    float32x4_t v_addition = vreinterpretq_f32_s32(vorrq_s32(half, vandq_s32(signMask, vreinterpretq_s32_f32(v))));
-    return vaddq_f32(v, v_addition);
-}
-
 template <typename T>
 inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 {
@@ -69,17 +63,10 @@ inline T divSaturateQ(const T &v1, const T &v2, const float scale)
 }
 
 template <>
 inline int32x4_t divSaturateQ<int32x4_t>(const int32x4_t &v1, const int32x4_t &v2, const float scale)
-{ return vcvtq_s32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2))))); }
+{ return internal::vroundq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); }
 
 template <>
 inline uint32x4_t divSaturateQ<uint32x4_t>(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
-{ return vcvtq_u32_f32(vroundq(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2))))); }
-
-inline float32x2_t vround(const float32x2_t& v)
-{
-    const int32x2_t signMask = vdup_n_s32(1 << 31), half = vreinterpret_s32_f32(vdup_n_f32(0.5f));
-    float32x2_t v_addition = vreinterpret_f32_s32(vorr_s32(half, vand_s32(signMask, vreinterpret_s32_f32(v))));
-    return vadd_f32(v, v_addition);
-}
+{ return internal::vroundq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); }
 
 template <typename T>
 inline T divSaturate(const T &v1, const T &v2, const float scale)
@@ -88,10 +75,10 @@ inline T divSaturate(const T &v1, const T &v2, const float scale)
 }
 
 template <>
 inline int32x2_t divSaturate<int32x2_t>(const int32x2_t &v1, const int32x2_t &v2, const float scale)
-{ return vcvt_s32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2))))); }
+{ return internal::vround_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); }
 
 template <>
 inline uint32x2_t divSaturate<uint32x2_t>(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
-{ return vcvt_u32_f32(vround(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2))))); }
+{ return internal::vround_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
 
 template <typename T>

View File

@@ -41,6 +41,7 @@
 
 #include <cmath>
 #include "common.hpp"
+#include "vround_helper.hpp"
 
 namespace CAROTENE_NS {
 
@@ -121,8 +122,6 @@ void phase(const Size2D &size,
     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
 
-    float32x4_t v_05 = vdupq_n_f32(0.5f);
-
     for (size_t i = 0; i < size.height; ++i)
     {
         const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
@@ -149,8 +148,8 @@ void phase(const Size2D &size,
                 float32x4_t v_dst32f1;
                 FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-                uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                                    vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+                uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                                    vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
                 // 1
                 v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
@@ -161,8 +160,8 @@ void phase(const Size2D &size,
                 v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
                 FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-                uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                                    vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+                uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                                    vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
                 vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                               vmovn_u16(v_dst16s1)));
@@ -182,8 +181,8 @@ void phase(const Size2D &size,
                 float32x4_t v_dst32f1;
                 FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)
 
-                uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
-                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));
+                uint16x8_t v_dst = vcombine_u16(vmovn_u32(internal::vroundq_u32_f32(v_dst32f0)),
+                                                vmovn_u32(internal::vroundq_u32_f32(v_dst32f1)));
 
                 vst1_u8(dst + j, vmovn_u16(v_dst));
             }

3rdparty/carotene/src/vround_helper.hpp (vendored, new file, 102 lines)
View File

@@ -0,0 +1,102 @@
/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#ifndef CAROTENE_SRC_VROUND_HELPER_HPP
#define CAROTENE_SRC_VROUND_HELPER_HPP

#include "common.hpp"
#include "vtransform.hpp"

#ifdef CAROTENE_NEON

/**
 * This helper header provides rounding from float32xN to uint32xN or int32xN, to nearest with ties to even.
 * See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
 */

// See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
#define CAROTENE_ROUND_DELTA (12582912.0f)

namespace CAROTENE_NS { namespace internal {

inline uint32x4_t vroundq_u32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_u32_f32(val);
#else
    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    return vcvtq_u32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}

inline uint32x2_t vround_u32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_u32_f32(val);
#else
    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
    return vcvt_u32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}

inline int32x4_t vroundq_s32_f32(const float32x4_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtnq_s32_f32(val);
#else
    const float32x4_t delta = vdupq_n_f32(CAROTENE_ROUND_DELTA);
    return vcvtq_s32_f32(vsubq_f32(vaddq_f32(val, delta), delta));
#endif
}

inline int32x2_t vround_s32_f32(const float32x2_t val)
{
#if CAROTENE_NEON_ARCH >= 8 /* get ready for ARMv9 */
    return vcvtn_s32_f32(val);
#else
    const float32x2_t delta = vdup_n_f32(CAROTENE_ROUND_DELTA);
    return vcvt_s32_f32(vsub_f32(vadd_f32(val, delta), delta));
#endif
}

} }

#endif // CAROTENE_NEON

#endif
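
A note on the constant: `12582912.0f` is `1.5 * 2^23`. At that magnitude the spacing between adjacent `float` values is exactly `1.0`, so `(val + delta) - delta` makes the FPU's default round-to-nearest-even act at the integer position; this is how the ARMv7 fallback reproduces what the ARMv8 `vcvtn*` instructions do in hardware. A scalar model of the trick (illustration only, not code from the patch; `volatile` defeats constant folding, and `|val|` must stay well below 2^22):

```cpp
#include <cstdio>

// Scalar model of the CAROTENE_ROUND_DELTA trick: 12582912.0f == 1.5 * 2^23
static float round_via_delta(float val)
{
    volatile float tmp = val + 12582912.0f; // ULP is 1.0 here -> hardware rounds ties to even
    return tmp - 12582912.0f;               // undo the shift, keeping the rounded value
}

int main()
{
    const float vals[] = { 0.5f, 1.5f, 2.5f, -0.5f, -1.5f };
    for (float v : vals)
        std::printf("%5.1f -> %5.1f\n", v, round_via_delta(v)); // expect 0, 2, 2, 0, -2
    return 0;
}
```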

View File

@@ -915,7 +915,15 @@ foreach(hal ${OpenCV_HAL})
     if(";${CPU_BASELINE_FINAL};" MATCHES ";NEON;")
       add_subdirectory(3rdparty/carotene/hal)
       ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS)
-      list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})")
+      if( NOT DEFINED CAROTENE_NEON_ARCH)
+        set(CAROTENE_NEON_MSG "Auto detected")
+      elseif( CAROTENE_NEON_ARCH GREATER 7)
+        set(CAROTENE_NEON_MSG "Force ARMv8+")
+      else()
+        set(CAROTENE_NEON_MSG "Force ARMv7")
+      endif()
+      list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION}, ${CAROTENE_NEON_MSG})")
     else()
       message(STATUS "Carotene: NEON is not available, disabling carotene...")
     endif()

View File

@@ -585,6 +585,7 @@ Following options can be used to change installation layout for common scenarios
 | `BUILD_FAT_JAVA_LIB` | _ON_ (for static Android builds) | Build single _opencv_java_ dynamic library containing all library functionality bundled with Java bindings. |
 | `BUILD_opencv_python2` | _ON_ | Build python2 bindings (deprecated). Python with development files and numpy must be installed. |
 | `BUILD_opencv_python3` | _ON_ | Build python3 bindings. Python with development files and numpy must be installed. |
+| `CAROTENE_NEON_ARCH` | `(auto)` | Switch the NEON architecture used by carotene. If unset, it is auto-detected. If set to 8, ARMv8 (and later) is used; otherwise ARMv7 is used. |
 
 TODO: need separate tutorials covering bindings builds

View File

@@ -1990,11 +1990,9 @@ inline v_int32x4 v_round(const v_float32x4& a)
 #else
 inline v_int32x4 v_round(const v_float32x4& a)
 {
-    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
-        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-
-    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
-    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
+    // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007
+    float32x4_t delta = vdupq_n_f32(12582912.0f);
+    return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta)));
 }
 #endif
 
 inline v_int32x4 v_floor(const v_float32x4& a)
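
This applies the same add-subtract-delta scheme to the ARMv7 fallback of the universal intrinsics, replacing the old sign-masked `+0.5` trick, so `v_round` rounds ties to even on every path. A usage sketch (assumes an OpenCV build with 128-bit SIMD support; illustration only):

```cpp
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
    float in[4] = { 0.5f, 1.5f, 2.5f, -0.5f };
    int out[4];
    cv::v_float32x4 v = cv::v_load(in);
    cv::v_int32x4 r = cv::v_round(v); // round to nearest, ties to even
    cv::v_store(out, r);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // expect: 0 2 2 0
    return 0;
}
```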

View File

@@ -1570,4 +1570,54 @@ TEST(Core_Arithm, scalar_handling_19599) // https://github.com/opencv/opencv/is
     EXPECT_EQ(1, c.rows);
 }
 
+// https://github.com/opencv/opencv/issues/24163
+typedef tuple<perf::MatDepth,int,int,int> Arith_Regression24163Param;
+typedef testing::TestWithParam<Arith_Regression24163Param> Core_Arith_Regression24163;
+
+#if defined __riscv
+TEST_P(Core_Arith_Regression24163, DISABLED_test_for_ties_to_even)
+#else
+TEST_P(Core_Arith_Regression24163, test_for_ties_to_even)
+#endif
+{
+    const int matDepth  = get<0>(GetParam());
+    const int matHeight = get<1>(GetParam());
+    const int matWidth  = 3; // Fixed
+    const int alpha     = get<2>(GetParam());
+    const int beta      = get<3>(GetParam());
+
+    // If alpha and/or beta are negative and matDepth is unsigned, the test is skipped.
+    if( ( (alpha < 0) || (beta < 0) )
+        &&
+        ( (matDepth != CV_8S) && (matDepth != CV_16S) && (matDepth != CV_32S) ) )
+    {
+        throw SkipTestException( cv::format("Test is skipped (matDepth is not signed, alpha = %d, beta = %d)", alpha, beta) );
+    }
+
+    const int matType = CV_MAKE_TYPE(matDepth, 1);
+    const Size matSize(matWidth, matHeight);
+    const Mat src1(matSize, matType, Scalar(alpha,alpha,alpha,alpha));
+    const Mat src2(matSize, matType, Scalar(beta, beta, beta, beta));
+    const Mat result = ( src1 + src2 ) / 2;
+
+    // The default rounding mode is expected to be FE_TONEAREST (ties to even).
+    const int mean = lrint( static_cast<double>(alpha + beta) / 2.0 );
+    const Mat expected(matSize, matType, Scalar(mean,mean,mean,mean));
+
+    // Compare result and expected.
+    ASSERT_EQ(expected.size(), result.size());
+    EXPECT_EQ(0, cvtest::norm(expected, result, NORM_INF)) <<
+        "result="   << std::endl << result << std::endl <<
+        "expected=" << std::endl << expected;
+}
+
+INSTANTIATE_TEST_CASE_P(/* */, Core_Arith_Regression24163,
+    testing::Combine(
+        testing::Values(perf::MatDepth(CV_8U), CV_8S, CV_16U, CV_16S, CV_32S), // MatDepth
+        testing::Values( 3, 4, 5, 6),    // MatHeight
+        testing::Values(-2,-1, 0, 1, 2), // src1
+        testing::Values(   -1, 0, 1 )    // src2
+    )
+);
+
 }} // namespace